From d6865179025813ff7771c98f839f9270d7fee30e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9gis=20Behmo?= Date: Tue, 25 Aug 2020 20:10:58 +0200 Subject: [PATCH 1/2] [BD-21] Cleaner comment parsing --- code_annotations/extensions/base.py | 41 ++++++++++++++++------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/code_annotations/extensions/base.py b/code_annotations/extensions/base.py index 69fe36d..186f846 100644 --- a/code_annotations/extensions/base.py +++ b/code_annotations/extensions/base.py @@ -42,22 +42,24 @@ class SimpleRegexAnnotationExtension(AnnotationExtension, metaclass=ABCMeta): # Javascript and Python extensions for examples. lang_comment_definition = None - r""" + """ This format string/regex finds all comments in the file. The format tokens will be replaced with the language-specific comment definitions defined in the sub-classes. - {multi_start} - start of the language-specific multi-line comment (ex. /*) - ([\d\D]*?) - capture all of the characters... - {multi_end} - until you find the end of the language-specific multi-line comment (ex. */) - | - If you don't find any of those... - {single} - start by finding the single-line comment token (ex. //) - (.*) - and capture all characters until the end of the line - - Returns a 2-tuple of: - - ("Comment text", None) in the case of a multi-line comment OR - - (None, "Comment text") in the case of a single-line comment + Returns two named values: multiline_comment and singleline_comment. + """ + comment_regex_fmt = r""" + {multi_start} # start of the language-specific multi-line comment (ex. /*) + (?P<multiline_comment> + [\d\D]*? # capture all of the characters... + ) + {multi_end} # until you find the end of the language-specific multi-line comment (ex. */) + | # If you don't find any of those... + {single} # start by finding the single-line comment token (ex. 
//) + (?P<singleline_comment> + .* # and capture all characters until the end of the line + ) """ - comment_regex_fmt = r'{multi_start}([\d\D]*?){multi_end}|{single}(.*)' def __init__(self, config, echo): """ @@ -74,7 +76,8 @@ def __init__(self, config, echo): # pylint: disable=not-a-mapping self.comment_regex = re.compile( - self.comment_regex_fmt.format(**self.lang_comment_definition) + self.comment_regex_fmt.format(**self.lang_comment_definition), + flags=re.VERBOSE ) # Parent class will allow this class to populate self.strings_to_search via @@ -102,15 +105,15 @@ def search(self, file_handle): if any(anno in txt for anno in self.config.annotation_tokens): fname = clean_abs_path(file_handle.name, self.config.source_path) + # Iterate on all comments: both multi- and single-line. for match in self.comment_regex.finditer(txt): + # Get the line number by counting newlines + 1 (for the first line). + # Note that this is the line number of the beginning of the comment, not the + # annotation token itself. + line = txt.count('\n', 0, match.start()) + 1 # Should only be one match - comment_content = [item for item in match.groups() if item is not None][0] + comment_content = match.groupdict()["multiline_comment"] or match.groupdict()["singleline_comment"] for inner_match in self.query.finditer(comment_content): - # Get the line number by counting newlines + 1 (for the first line). - # Note that this is the line number of the beginning of the comment, not the - # annotation token itself. - line = txt.count('\n', 0, match.start()) + 1 - try: annotation_token = inner_match.group('token') annotation_data = inner_match.group('data') From 097db99c595bfb69873fdff7d80be697dd3f7934 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9gis=20Behmo?= Date: Thu, 27 Aug 2020 16:07:53 +0200 Subject: [PATCH 2/2] Add support for multiline annotations with single-line prefix ("#") Multiline annotations were previously supported only for multi-line comments. 
In Python: """...""" This introduces multiline annotations for comments prefixed by single-line comment signs. In Python: # .. pii: a multiline annotation # that spans multiple lines. This makes it possible to wrap long comment lines more naturally, in particular in Python. --- CHANGELOG.rst | 5 ++ code_annotations/__init__.py | 2 +- code_annotations/extensions/base.py | 69 ++++++++++++++----- .../multiline_singlelinecomment.pyt | 7 ++ tests/extensions/test_base_extensions.py | 16 +++++ tests/extensions/test_extension_python.py | 9 +++ 6 files changed, 88 insertions(+), 20 deletions(-) create mode 100644 tests/extensions/python_test_files/multiline_singlelinecomment.pyt diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 06dabff..65cca76 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -11,6 +11,11 @@ Change Log .. There should always be an "Unreleased" section for changes pending release. +[0.6.0] - 2020-08-27 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* Add support for multiline annotations for lines prefixed with single-line comment signs ("#") + [0.5.1] - 2020-08-25 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/code_annotations/__init__.py b/code_annotations/__init__.py index cfbe56a..8538025 100644 --- a/code_annotations/__init__.py +++ b/code_annotations/__init__.py @@ -2,4 +2,4 @@ Extensible tools for parsing annotations in codebases. """ -__version__ = '0.5.1' +__version__ = '0.6.0' diff --git a/code_annotations/extensions/base.py b/code_annotations/extensions/base.py index 186f846..9a45281 100644 --- a/code_annotations/extensions/base.py +++ b/code_annotations/extensions/base.py @@ -42,23 +42,26 @@ class SimpleRegexAnnotationExtension(AnnotationExtension, metaclass=ABCMeta): # Javascript and Python extensions for examples. lang_comment_definition = None - """ - This format string/regex finds all comments in the file. The format tokens will be replaced with the - language-specific comment definitions defined in the sub-classes. 
- - Returns two named values: multiline_comment and singleline_comment. - """ + # This format string/regex finds all comments in the file. The format tokens will be replaced with the + # language-specific comment definitions defined in the sub-classes. + # + # Match groupdict will contain two named subgroups: 'comment' and 'prefixed_comment', of which at most + # one will be non-None. comment_regex_fmt = r""" - {multi_start} # start of the language-specific multi-line comment (ex. /*) - (?P<multiline_comment> - [\d\D]*? # capture all of the characters... - ) - {multi_end} # until you find the end of the language-specific multi-line comment (ex. */) - | # If you don't find any of those... - {single} # start by finding the single-line comment token (ex. //) - (?P<singleline_comment> - .* # and capture all characters until the end of the line - ) + {multi_start} # start of the language-specific multi-line comment (ex. /*) + (?P<comment> # Look for a multiline comment + [\d\D]*? # capture all of the characters... + ) + {multi_end} # until you find the end of the language-specific multi-line comment (ex. */) + | # If you don't find any of those... + (?P<prefixed_comment> # Look for a group of single-line comments + (?: # Non-capture mode + {single} # start by finding the single-line comment token (ex. //) + .* # and capture all characters until the end of the line + \n? 
# followed by an optional newline + \ * # and some empty space + )* # multiple times + ) """ def __init__(self, config, echo): @@ -79,6 +82,10 @@ def __init__(self, config, echo): self.comment_regex_fmt.format(**self.lang_comment_definition), flags=re.VERBOSE ) + self.prefixed_comment_regex = re.compile( + r"^ *{single}".format(**self.lang_comment_definition), + flags=re.MULTILINE + ) # Parent class will allow this class to populate self.strings_to_search via # calls to _add_annotation_token or _add_annotation_group for each configured @@ -105,14 +112,14 @@ def search(self, file_handle): if any(anno in txt for anno in self.config.annotation_tokens): fname = clean_abs_path(file_handle.name, self.config.source_path) - # Iterate on all comments: both multi- and single-line. + # Iterate on all comments: both prefixed- and non-prefixed. for match in self.comment_regex.finditer(txt): # Get the line number by counting newlines + 1 (for the first line). # Note that this is the line number of the beginning of the comment, not the # annotation token itself. line = txt.count('\n', 0, match.start()) + 1 - # Should only be one match - comment_content = match.groupdict()["multiline_comment"] or match.groupdict()["singleline_comment"] + + comment_content = self._find_comment_content(match) for inner_match in self.query.finditer(comment_content): try: annotation_token = inner_match.group('token') @@ -134,3 +141,27 @@ def search(self, file_handle): }) return found_annotations + + def _find_comment_content(self, match): + """ + Return the comment content as text. + + Args: + match (sre.SRE_MATCH): one of the matches of the self.comment_regex regular expression. 
+ """ + comment_content = match.groupdict()["comment"] + if comment_content: + return comment_content + + # Find single-line comments and strip comment tokens + comment_content = match.groupdict()["prefixed_comment"] + return self._strip_single_line_comment_tokens(comment_content) + + def _strip_single_line_comment_tokens(self, content): + """ + Strip the leading single-line comment tokens from a comment text. + + Args: + content (str): token-prefixed multi-line comment string. + """ + return self.prefixed_comment_regex.sub("", content) diff --git a/tests/extensions/python_test_files/multiline_singlelinecomment.pyt b/tests/extensions/python_test_files/multiline_singlelinecomment.pyt new file mode 100644 index 0000000..c59334e --- /dev/null +++ b/tests/extensions/python_test_files/multiline_singlelinecomment.pyt @@ -0,0 +1,7 @@ +# Docstring +#.. pii: A long description that +# spans multiple +# lines +# A comment that is not indented and not part of the above multi-line annotation +#.. pii_types: id, name +# Some comment that comes after the multiple-line annotation diff --git a/tests/extensions/test_base_extensions.py b/tests/extensions/test_base_extensions.py index cc38f1c..78ce501 100644 --- a/tests/extensions/test_base_extensions.py +++ b/tests/extensions/test_base_extensions.py @@ -28,3 +28,19 @@ def test_nothing_found(): r = FakeExtension(config, VerboseEcho()) with open('tests/extensions/base_test_files/empty.foo') as f: r.search(f) + + +def test_strip_single_line_comment_tokens(): + config = FakeConfig() + + extension = FakeExtension(config, VerboseEcho()) + text = """baz line1 + baz line2 +bazline3 +baz line4""" + expected_result = """ line1 + line2 +line3 + line4""" + # pylint: disable=protected-access + assert expected_result == extension._strip_single_line_comment_tokens(text) diff --git a/tests/extensions/test_extension_python.py b/tests/extensions/test_extension_python.py index b41413b..fc8f7a2 100644 --- a/tests/extensions/test_extension_python.py +++ 
b/tests/extensions/test_extension_python.py @@ -76,6 +76,15 @@ def test_grouping_and_choice_failures(test_file, expected_exit_code, expected_me Multi-line and multi-paragraph.""") ] ), + ( + 'multiline_singlelinecomment.pyt', + [ + ('.. pii:', """A long description that + spans multiple + lines"""), + ('.. pii_types:', 'id, name'), + ] + ), ]) def test_multi_line_annotations(test_file, annotations): config = AnnotationConfig('tests/test_configurations/.annotations_test')