From d6865179025813ff7771c98f839f9270d7fee30e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9gis=20Behmo?= <regis@behmo.com>
Date: Tue, 25 Aug 2020 20:10:58 +0200
Subject: [PATCH 1/2] [BD-21] Cleaner comment parsing

---
 code_annotations/extensions/base.py | 41 ++++++++++++++++-------------
 1 file changed, 22 insertions(+), 19 deletions(-)

diff --git a/code_annotations/extensions/base.py b/code_annotations/extensions/base.py
index 69fe36d..186f846 100644
--- a/code_annotations/extensions/base.py
+++ b/code_annotations/extensions/base.py
@@ -42,22 +42,24 @@ class SimpleRegexAnnotationExtension(AnnotationExtension, metaclass=ABCMeta):
     # Javascript and Python extensions for examples.
     lang_comment_definition = None
 
-    r"""
+    """
     This format string/regex finds all comments in the file. The format tokens will be replaced with the
     language-specific comment definitions defined in the sub-classes.
 
-    {multi_start} - start of the language-specific multi-line comment (ex. /*)
-    ([\d\D]*?)    - capture all of the characters...
-    {multi_end}   - until you find the end of the language-specific multi-line comment (ex. */)
-    |             - If you don't find any of those...
-    {single}      - start by finding the single-line comment token (ex. //)
-    (.*)          - and capture all characters until the end of the line
-
-    Returns a 2-tuple of:
-     - ("Comment text", None) in the case of a multi-line comment OR
-     - (None, "Comment text") in the case of a single-line comment
+    Returns two named values: multiline_comment and singleline_comment.
+    """
+    comment_regex_fmt = r"""
+    {multi_start}   # start of the language-specific multi-line comment (ex. /*)
+    (?P<multiline_comment>
+        [\d\D]*?    # capture all of the characters...
+    )
+    {multi_end}     # until you find the end of the language-specific multi-line comment (ex. */)
+    |               # If you don't find any of those...
+    {single}        # start by finding the single-line comment token (ex. //)
+    (?P<singleline_comment>
+    .*              # and capture all characters until the end of the line
+    )
     """
-    comment_regex_fmt = r'{multi_start}([\d\D]*?){multi_end}|{single}(.*)'
 
     def __init__(self, config, echo):
         """
@@ -74,7 +76,8 @@ def __init__(self, config, echo):
 
         # pylint: disable=not-a-mapping
         self.comment_regex = re.compile(
-            self.comment_regex_fmt.format(**self.lang_comment_definition)
+            self.comment_regex_fmt.format(**self.lang_comment_definition),
+            flags=re.VERBOSE
         )
 
         # Parent class will allow this class to populate self.strings_to_search via
@@ -102,15 +105,15 @@ def search(self, file_handle):
         if any(anno in txt for anno in self.config.annotation_tokens):
             fname = clean_abs_path(file_handle.name, self.config.source_path)
 
+            # Iterate on all comments: both multi- and single-line.
             for match in self.comment_regex.finditer(txt):
+                # Get the line number by counting newlines + 1 (for the first line).
+                # Note that this is the line number of the beginning of the comment, not the
+                # annotation token itself.
+                line = txt.count('\n', 0, match.start()) + 1
                 # Should only be one match
-                comment_content = [item for item in match.groups() if item is not None][0]
+                comment_content = match.groupdict()["multiline_comment"] or match.groupdict()["singleline_comment"]
                 for inner_match in self.query.finditer(comment_content):
-                    # Get the line number by counting newlines + 1 (for the first line).
-                    # Note that this is the line number of the beginning of the comment, not the
-                    # annotation token itself.
-                    line = txt.count('\n', 0, match.start()) + 1
-
                     try:
                         annotation_token = inner_match.group('token')
                         annotation_data = inner_match.group('data')

From 097db99c595bfb69873fdff7d80be697dd3f7934 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9gis=20Behmo?= <regis@behmo.com>
Date: Thu, 27 Aug 2020 16:07:53 +0200
Subject: [PATCH 2/2] Add support for multiline annotations with single-line
 prefix ("#")

Multiline annotations were previously supported only inside multi-line
comments (in Python: """...""").

This introduces multiline annotations for comments prefixed by
single-line comment signs. In Python:

    # .. pii: a multiline annotation
    #   that spans multiple lines.

This makes it possible to wrap long comment lines more naturally, in
particular in Python.
---
 CHANGELOG.rst                                 |  5 ++
 code_annotations/__init__.py                  |  2 +-
 code_annotations/extensions/base.py           | 69 ++++++++++++++-----
 .../multiline_singlelinecomment.pyt           |  7 ++
 tests/extensions/test_base_extensions.py      | 16 +++++
 tests/extensions/test_extension_python.py     |  9 +++
 6 files changed, 88 insertions(+), 20 deletions(-)
 create mode 100644 tests/extensions/python_test_files/multiline_singlelinecomment.pyt

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 06dabff..65cca76 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -11,6 +11,11 @@ Change Log
 
 .. There should always be an "Unreleased" section for changes pending release.
 
+[0.6.0] - 2020-08-27
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+* Add support for multiline annotations for lines prefixed with single-line comment signs ("#")
+
 [0.5.1] - 2020-08-25
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/code_annotations/__init__.py b/code_annotations/__init__.py
index cfbe56a..8538025 100644
--- a/code_annotations/__init__.py
+++ b/code_annotations/__init__.py
@@ -2,4 +2,4 @@
 Extensible tools for parsing annotations in codebases.
 """
 
-__version__ = '0.5.1'
+__version__ = '0.6.0'
diff --git a/code_annotations/extensions/base.py b/code_annotations/extensions/base.py
index 186f846..9a45281 100644
--- a/code_annotations/extensions/base.py
+++ b/code_annotations/extensions/base.py
@@ -42,23 +42,26 @@ class SimpleRegexAnnotationExtension(AnnotationExtension, metaclass=ABCMeta):
     # Javascript and Python extensions for examples.
     lang_comment_definition = None
 
-    """
-    This format string/regex finds all comments in the file. The format tokens will be replaced with the
-    language-specific comment definitions defined in the sub-classes.
-
-    Returns two named values: multiline_comment and singleline_comment.
-    """
+    # This format string/regex finds all comments in the file. The format tokens will be replaced with the
+    # language-specific comment definitions defined in the sub-classes.
+    #
+    # Match groupdict will contain two named subgroups: 'comment' and 'prefixed_comment', of which exactly
+    # one will be non-None (the non-None group may still be an empty string).
     comment_regex_fmt = r"""
-    {multi_start}   # start of the language-specific multi-line comment (ex. /*)
-    (?P<multiline_comment>
-        [\d\D]*?    # capture all of the characters...
-    )
-    {multi_end}     # until you find the end of the language-specific multi-line comment (ex. */)
-    |               # If you don't find any of those...
-    {single}        # start by finding the single-line comment token (ex. //)
-    (?P<singleline_comment>
-    .*              # and capture all characters until the end of the line
-    )
+        {multi_start}           # start of the language-specific multi-line comment (ex. /*)
+        (?P<comment>            # Look for a multiline comment
+            [\d\D]*?            # capture all of the characters...
+        )
+        {multi_end}             # until you find the end of the language-specific multi-line comment (ex. */)
+        |                       # If you don't find any of those...
+        (?P<prefixed_comment>   # Look for a group of single-line comments
+            (?:                 # Non-capturing group
+                {single}        # start by finding the single-line comment token (ex. //)
+                .*              # and capture all characters until the end of the line
+                \n?             # followed by an optional newline
+                \ *             # and some empty space
+            )*                  # multiple times
+        )
     """
 
     def __init__(self, config, echo):
@@ -79,6 +82,10 @@ def __init__(self, config, echo):
             self.comment_regex_fmt.format(**self.lang_comment_definition),
             flags=re.VERBOSE
         )
+        self.prefixed_comment_regex = re.compile(
+            r"^ *{single}".format(**self.lang_comment_definition),
+            flags=re.MULTILINE
+        )
 
         # Parent class will allow this class to populate self.strings_to_search via
         # calls to _add_annotation_token or _add_annotation_group for each configured
@@ -105,14 +112,14 @@ def search(self, file_handle):
         if any(anno in txt for anno in self.config.annotation_tokens):
             fname = clean_abs_path(file_handle.name, self.config.source_path)
 
-            # Iterate on all comments: both multi- and single-line.
+            # Iterate on all comments: both prefixed- and non-prefixed.
             for match in self.comment_regex.finditer(txt):
                 # Get the line number by counting newlines + 1 (for the first line).
                 # Note that this is the line number of the beginning of the comment, not the
                 # annotation token itself.
                 line = txt.count('\n', 0, match.start()) + 1
-                # Should only be one match
-                comment_content = match.groupdict()["multiline_comment"] or match.groupdict()["singleline_comment"]
+
+                comment_content = self._find_comment_content(match)
                 for inner_match in self.query.finditer(comment_content):
                     try:
                         annotation_token = inner_match.group('token')
@@ -134,3 +141,27 @@ def search(self, file_handle):
                     })
 
         return found_annotations
+
+    def _find_comment_content(self, match):
+        """
+        Return the comment content as text.
+
+        Args:
+            match (re.Match): one of the matches of the self.comment_regex regular expression.
+        """
+        comment_content = match.groupdict()["comment"]
+        if comment_content:
+            return comment_content
+
+        # Find single-line comments and strip comment tokens
+        comment_content = match.groupdict()["prefixed_comment"]
+        return self._strip_single_line_comment_tokens(comment_content)
+
+    def _strip_single_line_comment_tokens(self, content):
+        """
+        Strip the leading single-line comment tokens from a comment text.
+
+        Args:
+            content (str): token-prefixed multi-line comment string.
+        """
+        return self.prefixed_comment_regex.sub("", content)
diff --git a/tests/extensions/python_test_files/multiline_singlelinecomment.pyt b/tests/extensions/python_test_files/multiline_singlelinecomment.pyt
new file mode 100644
index 0000000..c59334e
--- /dev/null
+++ b/tests/extensions/python_test_files/multiline_singlelinecomment.pyt
@@ -0,0 +1,7 @@
+# Docstring
+#.. pii: A long description that
+#  spans multiple
+#  lines
+# A comment that is not indented and not part of the above multi-line annotation
+#.. pii_types: id, name
+# Some comment that comes after the multiple-line annotation
diff --git a/tests/extensions/test_base_extensions.py b/tests/extensions/test_base_extensions.py
index cc38f1c..78ce501 100644
--- a/tests/extensions/test_base_extensions.py
+++ b/tests/extensions/test_base_extensions.py
@@ -28,3 +28,19 @@ def test_nothing_found():
     r = FakeExtension(config, VerboseEcho())
     with open('tests/extensions/base_test_files/empty.foo') as f:
         r.search(f)
+
+
+def test_strip_single_line_comment_tokens():
+    config = FakeConfig()
+
+    extension = FakeExtension(config, VerboseEcho())
+    text = """baz line1
+  baz line2
+bazline3
+baz   line4"""
+    expected_result = """ line1
+ line2
+line3
+   line4"""
+    # pylint: disable=protected-access
+    assert expected_result == extension._strip_single_line_comment_tokens(text)
diff --git a/tests/extensions/test_extension_python.py b/tests/extensions/test_extension_python.py
index b41413b..fc8f7a2 100644
--- a/tests/extensions/test_extension_python.py
+++ b/tests/extensions/test_extension_python.py
@@ -76,6 +76,15 @@ def test_grouping_and_choice_failures(test_file, expected_exit_code, expected_me
      Multi-line and multi-paragraph.""")
         ]
     ),
+    (
+        'multiline_singlelinecomment.pyt',
+        [
+            ('.. pii:', """A long description that
+  spans multiple
+  lines"""),
+            ('.. pii_types:', 'id, name'),
+        ]
+    ),
 ])
 def test_multi_line_annotations(test_file, annotations):
     config = AnnotationConfig('tests/test_configurations/.annotations_test')