From 32a0843a618d592def575eb726996b532f32c2b3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9gis=20Behmo?= <regis@behmo.com>
Date: Sun, 5 Jul 2020 16:04:04 +0200
Subject: [PATCH 1/3] Use pre-compiled regular expressions for clarity

Having pre-compiled regular expressions with verbose mode allows us to
write clearer regexes with comments.
---
 code_annotations/extensions/base.py | 10 ++++++----
 code_annotations/find_django.py     |  3 +--
 code_annotations/helpers.py         | 22 +++++++++++-----------
 3 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/code_annotations/extensions/base.py b/code_annotations/extensions/base.py
index 6dcb470..6b0af26 100644
--- a/code_annotations/extensions/base.py
+++ b/code_annotations/extensions/base.py
@@ -88,14 +88,16 @@ def __init__(self, config, echo):
             raise ValueError('Subclasses of SimpleRegexAnnotationExtension must define lang_comment_definition!')
 
         # pylint: disable=not-a-mapping
-        self.comment_regex = self.comment_regex_fmt.format(**self.lang_comment_definition)
+        self.comment_regex = re.compile(
+            self.comment_regex_fmt.format(**self.lang_comment_definition)
+        )
 
         # Parent class will allow this class to populate self.strings_to_search via
         # calls to _add_annotation_token or _add_annotation_group for each configured
         # annotation.
         self.query = get_annotation_regex(self.config.annotation_regexes)
 
-        self.ECHO.echo_v("{} extension regex query: {}".format(self.extension_name, self.query))
+        self.ECHO.echo_v("{} extension regex query: {}".format(self.extension_name, self.query.pattern))
 
     def search(self, file_handle):
         """
@@ -115,10 +117,10 @@ def search(self, file_handle):
         if any(anno in txt for anno in self.config.annotation_tokens):
             fname = clean_abs_path(file_handle.name, self.config.source_path)
 
-            for match in re.finditer(self.comment_regex, txt):
+            for match in self.comment_regex.finditer(txt):
                 # Should only be one match
                 comment_content = [item for item in match.groups() if item is not None][0]
-                for inner_match in re.finditer(self.query, comment_content):
+                for inner_match in self.query.finditer(comment_content):
                     # Get the line number by counting newlines + 1 (for the first line).
                     # Note that this is the line number of the beginning of the comment, not the
                     # annotation token itself.
diff --git a/code_annotations/find_django.py b/code_annotations/find_django.py
index 225fa06..ac7c81f 100644
--- a/code_annotations/find_django.py
+++ b/code_annotations/find_django.py
@@ -4,7 +4,6 @@
 import inspect
 import os
 import pprint
-import re
 import sys
 
 import django
@@ -109,7 +108,7 @@ def _append_model_annotations(self, model_type, model_id, query, model_annotatio
         with open(filename, 'r') as file_handle:
             txt = file_handle.read()
 
-        for inner_match in re.finditer(query, model_type.__doc__):
+        for inner_match in query.finditer(model_type.__doc__):
             # TODO: This is duplicated code with extensions/base.py
             # No matter how long the regex is, there should only be 2 non-None items,
             # with the first being the annotation token and the 2nd being the comment.
diff --git a/code_annotations/helpers.py b/code_annotations/helpers.py
index 1205098..c95e305 100644
--- a/code_annotations/helpers.py
+++ b/code_annotations/helpers.py
@@ -2,6 +2,7 @@
 Helpers for code_annotations scripts.
 """
 import os
+import re
 import sys
 
 import click
@@ -112,6 +113,7 @@ def clean_abs_path(filename_to_clean, parent_path):
 def get_annotation_regex(annotation_regexes):
     """
     Return the full regex to search inside comments for configured annotations.
+    A match against the regex will returns a 2-tuple of found annotation token and annotation comment
 
     Args:
         annotation_regexes: List of re.escaped annotation tokens to search for.
@@ -123,15 +125,13 @@ def get_annotation_regex(annotation_regexes):
     r"""
     This format string/regex finds our annotation token and choices / comments inside a comment:
 
-    [\s\S]*? - Strip out any characters between the start of the comment and the annotation
-    ({})     - {} is a Python format string that will be replaced with a regex escaped and
-               then or-joined to make a list of the annotation tokens we're looking for
-               Ex: (\.\.\ pii\:\:|\.\.\ pii\_types\:\:)
-    (.*)     - and capture all characters until the end of the line
-
-    Returns a 2-tuple of found annotation token and annotation comment
-
-    TODO: Make multi line annotation comments work again.
     """
-    annotation_regex = r'[\s\S]*?({})(.*)'
-    return annotation_regex.format('|'.join(annotation_regexes))
+    annotation_regex = r"""
+    [\s\S]*?                   # Strip out any characters between the start of the comment and the annotation
+    ({})                       # Python format string that will be replaced with a regex escaped and
+                               # then or-joined to make a list of the annotation tokens we're looking for
+                               # Ex: (\.\.\ pii\:\:|\.\.\ pii\_types\:\:)
+    (.*)                       # capture all characters until the end of the line
+    """
+    annotation_regex = annotation_regex.format('|'.join(annotation_regexes))
+    return re.compile(annotation_regex, flags=re.VERBOSE)

From bb3f1aa7a505c3d1c9197810ad949df040726a8c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9gis=20Behmo?= <regis@behmo.com>
Date: Tue, 21 Jul 2020 15:39:05 +0200
Subject: [PATCH 2/3] Fix "writing annotation" configuration example

In all examples from this documentation page, the `fun_fact` token is
followed by only a single ":".
---
 docs/writing_annotations.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/writing_annotations.rst b/docs/writing_annotations.rst
index 7689e4c..5053657 100644
--- a/docs/writing_annotations.rst
+++ b/docs/writing_annotations.rst
@@ -26,7 +26,7 @@ Configuration for a "fun fact" annotation type, denoted by the annotation token
 .. code-block:: yaml
 
     annotations:
-        ".. fun_fact::":
+        ".. fun_fact:":
 
 There are no choices given, so this is a free form comment type of annotation. Note the trailing colon! It would be used
 in Python like this:

From 4cfcf960467f9d5a4b779086660574b3892a157b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9gis=20Behmo?= <regis@behmo.com>
Date: Thu, 2 Jul 2020 17:06:48 +0200
Subject: [PATCH 3/3] [BD-21] Implement multi-line code annotations

Make it possible (again) to write multi-line code annotations, such as:

    .. pii: This is an annotation that
      spans multiple lines and allows developers to
      write more extensive docs.

The only remaining limitation of multi-line annotations is that we did not
find a way to strip the indentation that prefixes every line after the
first, so that indentation is kept in the annotation data.
---
 CHANGELOG.rst                                 |  5 ++
 code_annotations/__init__.py                  |  2 +-
 code_annotations/extensions/base.py           | 39 ++++---------
 code_annotations/find_django.py               | 39 ++++++-------
 code_annotations/helpers.py                   | 58 +++++++++++++++----
 docs/writing_annotations.rst                  | 30 +++++++++-
 .../multiline_empty_first_line.pyt            |  7 +++
 .../python_test_files/multiline_indented.pyt  |  6 ++
 .../multiline_paragraphs.pyt                  | 15 +++++
 .../python_test_files/multiline_simple.pyt    |  8 +++
 tests/extensions/test_extension_python.py     | 57 ++++++++++++++++++
 11 files changed, 202 insertions(+), 64 deletions(-)
 create mode 100644 tests/extensions/python_test_files/multiline_empty_first_line.pyt
 create mode 100644 tests/extensions/python_test_files/multiline_indented.pyt
 create mode 100644 tests/extensions/python_test_files/multiline_paragraphs.pyt
 create mode 100644 tests/extensions/python_test_files/multiline_simple.pyt

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index de9a842..4da95bd 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -16,6 +16,11 @@ Unreleased
 
 *
 
+[0.4.0] - 2020-07-22
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* Add support for multi-line code annotations
+
 [0.3.4] - 2020-05-06
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/code_annotations/__init__.py b/code_annotations/__init__.py
index 6152c1e..d45c92b 100644
--- a/code_annotations/__init__.py
+++ b/code_annotations/__init__.py
@@ -2,4 +2,4 @@
 Extensible tools for parsing annotations in codebases.
 """
 
-__version__ = '0.3.4'
+__version__ = '0.4.0'
diff --git a/code_annotations/extensions/base.py b/code_annotations/extensions/base.py
index 6b0af26..69fe36d 100644
--- a/code_annotations/extensions/base.py
+++ b/code_annotations/extensions/base.py
@@ -4,7 +4,7 @@
 import re
 from abc import ABCMeta, abstractmethod
 
-from code_annotations.helpers import clean_abs_path, get_annotation_regex
+from code_annotations.helpers import clean_abs_path, clean_annotation, get_annotation_regex
 
 
 class AnnotationExtension(object, metaclass=ABCMeta):
@@ -56,24 +56,9 @@ class SimpleRegexAnnotationExtension(AnnotationExtension, metaclass=ABCMeta):
     Returns a 2-tuple of:
      - ("Comment text", None) in the case of a multi-line comment OR
      - (None, "Comment text") in the case of a single-line comment
-
-    TODO: Make this handle multi-line annotation comments again.
     """
     comment_regex_fmt = r'{multi_start}([\d\D]*?){multi_end}|{single}(.*)'
 
-    r"""
-    This format string/regex finds our annotation token and choices / comments inside a comment:
-
-    [\s\S]*? - Strip out any characters between the start of the comment and the annotation
-    ({})     - {} is a Python format string that will be replaced with a regex escaped and
-               then or-joined to make a list of the annotation tokens we're looking for
-               Ex: (\.\.\ pii\:\:|\.\.\ pii\_types\:\:)
-    (.*)     - and capture all characters until the end of the line
-
-    Returns a 2-tuple of found annotation token and annotation comment
-
-    TODO: Make multi line annotation comments work again.
-    """
     def __init__(self, config, echo):
         """
         Set up the extension and create the regexes used to do searches.
@@ -126,25 +111,23 @@ def search(self, file_handle):
                     # annotation token itself.
                     line = txt.count('\n', 0, match.start()) + 1
 
-                    # No matter how long the regex is, there should only be 2 non-None items,
-                    # with the first being the annotation token and the 2nd being the comment.
-                    cleaned_groups = [item for item in inner_match.groups() if item is not None]
-
-                    if len(cleaned_groups) != 2:  # pragma: no cover
-                        raise Exception('{}::{}: Number of found items in the list is not 2. Found: {}'.format(
+                    try:
+                        annotation_token = inner_match.group('token')
+                        annotation_data = inner_match.group('data')
+                    except IndexError:
+                        # pragma: no cover
+                        raise ValueError('{}::{}: Could not find "data" or "token" groups. Found: {}'.format(
                             fname,
                             line,
-                            cleaned_groups
+                            inner_match.groupdict()
                         ))
-
-                    annotation, comment = cleaned_groups
-
+                    annotation_token, annotation_data = clean_annotation(annotation_token, annotation_data)
                     found_annotations.append({
                         'found_by': self.extension_name,
                         'filename': fname,
                         'line_number': line,
-                        'annotation_token': annotation.strip(),
-                        'annotation_data': comment.strip()
+                        'annotation_token': annotation_token,
+                        'annotation_data': annotation_data,
                     })
 
         return found_annotations
diff --git a/code_annotations/find_django.py b/code_annotations/find_django.py
index ac7c81f..f960c8e 100644
--- a/code_annotations/find_django.py
+++ b/code_annotations/find_django.py
@@ -12,7 +12,7 @@
 from django.db import models
 
 from code_annotations.base import BaseSearch
-from code_annotations.helpers import fail, get_annotation_regex
+from code_annotations.helpers import clean_annotation, fail, get_annotation_regex
 
 DEFAULT_SAFELIST_FILE_PATH = '.annotation_safe_list.yml'
 
@@ -108,33 +108,30 @@ def _append_model_annotations(self, model_type, model_id, query, model_annotatio
         with open(filename, 'r') as file_handle:
             txt = file_handle.read()
 
-        for inner_match in query.finditer(model_type.__doc__):
-            # TODO: This is duplicated code with extensions/base.py
-            # No matter how long the regex is, there should only be 2 non-None items,
-            # with the first being the annotation token and the 2nd being the comment.
-            cleaned_groups = [item for item in inner_match.groups() if item is not None]
+        # Get the line number by counting newlines + 1 (for the first line).
+        # Note that this is the line number of the beginning of the comment, not the
+        # annotation token itself. We find based on the entire code content of the model
+        # as that seems to be the only way to be sure we're getting the correct line number.
+        # It is slow and should be replaced if we can find a better way that is accurate.
+        line = txt.count('\n', 0, txt.find(inspect.getsource(model_type))) + 1
 
-            if len(cleaned_groups) != 2:  # pragma: no cover
-                raise Exception('{}: Number of found items in the list is not 2. Found: {}'.format(
+        for inner_match in query.finditer(model_type.__doc__):
+            try:
+                annotation_token = inner_match.group('token')
+                annotation_data = inner_match.group('data')
+            except IndexError:
+                # pragma: no cover
+                raise ValueError('{}: Could not find "data" or "token" groups. Found: {}'.format(
                     self.get_model_id(model_type),
-                    cleaned_groups
+                    inner_match.groupdict()
                 ))
-
-            annotation, comment = cleaned_groups
-
-            # Get the line number by counting newlines + 1 (for the first line).
-            # Note that this is the line number of the beginning of the comment, not the
-            # annotation token itself. We find based on the entire code content of the model
-            # as that seems to be the only way to be sure we're getting the correct line number.
-            # It is slow and should be replaced if we can find a better way that is accurate.
-            line = txt.count('\n', 0, txt.find(inspect.getsource(model_type))) + 1
-
+            annotation_token, annotation_data = clean_annotation(annotation_token, annotation_data)
             model_annotations.append({
                 'found_by': "django",
                 'filename': filename,
                 'line_number': line,
-                'annotation_token': annotation.strip(),
-                'annotation_data': comment.strip(),
+                'annotation_token': annotation_token,
+                'annotation_data': annotation_data,
                 'extra': {
                     'object_id': model_id,
                     'full_comment': model_type.__doc__.strip()
diff --git a/code_annotations/helpers.py b/code_annotations/helpers.py
index c95e305..caa5f94 100644
--- a/code_annotations/helpers.py
+++ b/code_annotations/helpers.py
@@ -113,25 +113,59 @@ def clean_abs_path(filename_to_clean, parent_path):
 def get_annotation_regex(annotation_regexes):
     """
     Return the full regex to search inside comments for configured annotations.
-    A match against the regex will returns a 2-tuple of found annotation token and annotation comment
+
+    A successful match against the regex will return two groups of interest: 'token'
+    and 'data'.
+
+    This regular expression supports annotation data that spans multiple lines. To do
+    so, indent each line after the first by at least two additional spaces. E.g.:
+
+        .. pii: First line
+          second line
+
+    Unfortunately, the indenting spaces will find their way to the content of the "data" group.
 
     Args:
         annotation_regexes: List of re.escaped annotation tokens to search for.
 
     Returns:
         Regex ready for searching comments for annotations.
-    """
-    # pylint: disable=pointless-string-statement
-    r"""
-    This format string/regex finds our annotation token and choices / comments inside a comment:
-
     """
     annotation_regex = r"""
-    [\s\S]*?                   # Strip out any characters between the start of the comment and the annotation
-    ({})                       # Python format string that will be replaced with a regex escaped and
-                               # then or-joined to make a list of the annotation tokens we're looking for
-                               # Ex: (\.\.\ pii\:\:|\.\.\ pii\_types\:\:)
-    (.*)                       # capture all characters until the end of the line
+    (?P<space>[\ \t]*)               # Leading empty spaces
+    (?P<token>{tokens})              # Python format string that will be replaced with a
+                                     # regex, escaped and then or-joined to make a list
+                                     # of the annotation tokens we're looking for
+                                     # Ex: (\.\.\ pii\:\:|\.\.\ pii\_types\:\:)
+    (?P<data>                        # Captured annotation data
+        (?:                          # non-capture mode
+            .                        # any non-newline character
+            |                        # or new line of multi-line annotation data
+            (?:                      # non-capture mode
+                \n{{1,}}             # at least one newline,
+                (?P=space)           # followed by as much space as the prefix,
+                (?P<indent>\ {{2,}}) # at least two spaces,
+                (?=[^\ ])            # and a non-space character (look-ahead)
+                (?!{tokens})         # that does not match any of the token regexes
+            )                        #
+        )*                           # any number of times
+    )
     """
-    annotation_regex = annotation_regex.format('|'.join(annotation_regexes))
+    annotation_regex = annotation_regex.format(tokens='|'.join(annotation_regexes))
     return re.compile(annotation_regex, flags=re.VERBOSE)
+
+
+def clean_annotation(token, data):
+    """
+    Clean annotation token and data by stripping leading and trailing whitespace.
+
+    Args:
+        token (str)
+        data (str)
+
+    Returns:
+        (str, str): Tuple of cleaned token, data
+    """
+    token = token.strip()
+    data = data.strip()
+    return token, data
diff --git a/docs/writing_annotations.rst b/docs/writing_annotations.rst
index 5053657..e1c8a5d 100644
--- a/docs/writing_annotations.rst
+++ b/docs/writing_annotations.rst
@@ -13,8 +13,10 @@ comments into two parts- the annotation token, and the annotation data.
 
 - Annotation data
     Annotation data can either be a simple free text comment that is on the same line as the token, or a choice list.
-    The choices in a choice list are configured in the configuration file and can be separated by spaces or commas when
-    used in comments. As such, the choices themselves should not contain spaces or commas.
+    Free text annotations can span multiple lines, provided all lines after the first
+    are indented by at least two spaces. The choices in a choice list are configured in
+    the configuration file and can be separated by spaces or commas when used in
+    comments. As such, the choices themselves should not contain spaces or commas.
 
 The information below applies to both the Static Search and Django Model Search tools, with the exception that the
 Django Model Search only looks in model docstrings.
@@ -51,6 +53,30 @@ When a report is run against this code an entry like this will be generated in t
 
 *Note that the rest of the comment is ignored in the report.*
 
+An annotation can also span multiple lines. For instance:
+
+.. code-block:: python
+
+    """
+        This function handles setting the price on an item in the database.
+
+        .. fun_fact: This code is the only remaining piece of our first commit!
+          To write long texts, prepend at least two additional spaces at the start
+          of every line after the first.
+    """
+
+This code would result in the following report:
+
+.. code-block:: yaml
+
+    - annotation_data: "This code is the only remaining piece of our first commit!\n \
+        \     To write long texts, prepend at least two additional spaces at the start\n\
+        \      of every line after the first."
+      annotation_token: '.. fun_fact:'
+      filename: foo/bar/something.py
+      found_by: python
+      line_number: 1
+
 Configuration for an "async" annotation type, denoted by the annotation token ``.. async:`` and choices denoting the
 types of asynchronous processors hooked up to it:
 
diff --git a/tests/extensions/python_test_files/multiline_empty_first_line.pyt b/tests/extensions/python_test_files/multiline_empty_first_line.pyt
new file mode 100644
index 0000000..8b51749
--- /dev/null
+++ b/tests/extensions/python_test_files/multiline_empty_first_line.pyt
@@ -0,0 +1,7 @@
+"""
+.. pii:
+  This is an annotation that
+  spans multiple lines and allows developers to
+  write more extensive docs.
+Comment after annotation and being annotated
+"""
diff --git a/tests/extensions/python_test_files/multiline_indented.pyt b/tests/extensions/python_test_files/multiline_indented.pyt
new file mode 100644
index 0000000..1096233
--- /dev/null
+++ b/tests/extensions/python_test_files/multiline_indented.pyt
@@ -0,0 +1,6 @@
+"""
+    .. pii: A long description that
+        spans multiple indented
+        lines
+    .. pii_types: id, name
+"""
\ No newline at end of file
diff --git a/tests/extensions/python_test_files/multiline_paragraphs.pyt b/tests/extensions/python_test_files/multiline_paragraphs.pyt
new file mode 100644
index 0000000..c00b157
--- /dev/null
+++ b/tests/extensions/python_test_files/multiline_paragraphs.pyt
@@ -0,0 +1,15 @@
+"""
+.. pii: This is an annotation that
+  spans multiple paragraphs.
+
+  This allows developers to write even more
+  extensive docs.
+Comment after annotation and being annotated
+"""
+
+"""
+Docstring
+.. pii: Annotation 1 with:
+
+     Multi-line and multi-paragraph.
+"""
\ No newline at end of file
diff --git a/tests/extensions/python_test_files/multiline_simple.pyt b/tests/extensions/python_test_files/multiline_simple.pyt
new file mode 100644
index 0000000..dfee327
--- /dev/null
+++ b/tests/extensions/python_test_files/multiline_simple.pyt
@@ -0,0 +1,8 @@
+"""
+Docstring
+
+.. pii: A long description that
+  spans multiple
+  lines
+.. pii_types: id, name
+"""
diff --git a/tests/extensions/test_extension_python.py b/tests/extensions/test_extension_python.py
index a0f046e..b41413b 100644
--- a/tests/extensions/test_extension_python.py
+++ b/tests/extensions/test_extension_python.py
@@ -3,6 +3,9 @@
 """
 import pytest
 
+from code_annotations.base import AnnotationConfig
+from code_annotations.extensions.python import PythonAnnotationExtension
+from code_annotations.helpers import VerboseEcho
 from tests.helpers import EXIT_CODE_FAILURE, EXIT_CODE_SUCCESS, call_script
 
 
@@ -31,3 +34,57 @@ def test_grouping_and_choice_failures(test_file, expected_exit_code, expected_me
 
     if expected_exit_code == EXIT_CODE_FAILURE:
         assert "Search failed due to linting errors!" in result.output
+
+
+@pytest.mark.parametrize('test_file,annotations', [
+    (
+        'multiline_simple.pyt',
+        [
+            ('.. pii:', """A long description that
+  spans multiple
+  lines"""),
+            ('.. pii_types:', 'id, name'),
+        ]
+    ),
+    (
+        'multiline_indented.pyt',
+        [
+            ('.. pii:', """A long description that
+        spans multiple indented
+        lines"""),
+            ('.. pii_types:', 'id, name'),
+        ]
+    ),
+    (
+        'multiline_empty_first_line.pyt',
+        [
+            ('.. pii:', """This is an annotation that
+  spans multiple lines and allows developers to
+  write more extensive docs."""),
+        ]
+    ),
+    (
+        'multiline_paragraphs.pyt',
+        [
+            ('.. pii:', """This is an annotation that
+  spans multiple paragraphs.
+
+  This allows developers to write even more
+  extensive docs."""),
+            ('.. pii:', """Annotation 1 with:
+
+     Multi-line and multi-paragraph.""")
+        ]
+    ),
+])
+def test_multi_line_annotations(test_file, annotations):
+    config = AnnotationConfig('tests/test_configurations/.annotations_test')
+    annotator = PythonAnnotationExtension(config, VerboseEcho())
+
+    with open('tests/extensions/python_test_files/{}'.format(test_file)) as fi:
+        result_annotations = annotator.search(fi)
+
+    assert len(annotations) == len(result_annotations)
+    for annotation, result_annotation in zip(annotations, result_annotations):
+        assert result_annotation['annotation_token'] == annotation[0]
+        assert result_annotation['annotation_data'] == annotation[1]