From 073189ed6b1daae6c254a0b25f0800ce3e9f85c0 Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Thu, 3 Oct 2024 14:50:16 +0200 Subject: [PATCH 1/4] python: add test for `re.Match` objects returned from `finditer` --- python/ql/test/library-tests/frameworks/stdlib/test_re.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/ql/test/library-tests/frameworks/stdlib/test_re.py b/python/ql/test/library-tests/frameworks/stdlib/test_re.py index e45a3fe2576d..b95d65619e27 100644 --- a/python/ql/test/library-tests/frameworks/stdlib/test_re.py +++ b/python/ql/test/library-tests/frameworks/stdlib/test_re.py @@ -38,6 +38,12 @@ compiled_pat.match(ts).string, # $ tainted re.compile(ts).match("safe").re.pattern, # $ tainted + + list(re.finditer(pat, ts))[0].string, # $ MISSING: tainted + [m.string for m in re.finditer(pat, ts)], # $ MISSING: tainted + + list(re.finditer(pat, ts))[0].groups()[0], # $ MISSING: tainted + [m.groups()[0] for m in re.finditer(pat, ts)], # $ MISSING: tainted ) ensure_not_tainted( safe_match.expand("Hello \1"), From 494b8bd7e14d0bea50057eca124c203abb5ce584 Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Wed, 9 Oct 2024 12:40:47 +0200 Subject: [PATCH 2/4] python: model `string` property of resultof `finditer` --- python/ql/lib/semmle/python/frameworks/Stdlib.qll | 8 ++++++++ python/ql/test/library-tests/frameworks/stdlib/test_re.py | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/python/ql/lib/semmle/python/frameworks/Stdlib.qll b/python/ql/lib/semmle/python/frameworks/Stdlib.qll index c7179dbd46c0..950e741d58eb 100644 --- a/python/ql/lib/semmle/python/frameworks/Stdlib.qll +++ b/python/ql/lib/semmle/python/frameworks/Stdlib.qll @@ -3463,6 +3463,14 @@ module StdlibPrivate { ) and preservesValue = false ) + or + // flow from input string to attribute on match object + exists(int arg | arg = methodName.(RegexExecutionMethod).getStringArgIndex() - offset | + input in ["Argument[" + arg + "]", "Argument[string:]"] and + methodName = "finditer" and + output = "ReturnValue.ListElement.Attribute[string]" and + preservesValue = true + ) ) } } diff --git a/python/ql/test/library-tests/frameworks/stdlib/test_re.py b/python/ql/test/library-tests/frameworks/stdlib/test_re.py index b95d65619e27..c2c3c75a979a 100644 --- a/python/ql/test/library-tests/frameworks/stdlib/test_re.py +++ b/python/ql/test/library-tests/frameworks/stdlib/test_re.py @@ -39,8 +39,8 @@ compiled_pat.match(ts).string, # $ tainted re.compile(ts).match("safe").re.pattern, # $ tainted - list(re.finditer(pat, ts))[0].string, # $ MISSING: tainted - [m.string for m in re.finditer(pat, ts)], # $ MISSING: tainted + list(re.finditer(pat, ts))[0].string, # $ tainted + [m.string for m in re.finditer(pat, ts)], # $ tainted list(re.finditer(pat, ts))[0].groups()[0], # $ MISSING: tainted [m.groups()[0] for m in re.finditer(pat, ts)], # $ MISSING: tainted From 0ac4a10345735444ce8c00c62bfbdd5bd8387710 Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Wed, 9 Oct 2024 12:42:38 +0200 Subject: [PATCH 3/4] Python: model that `finditer` returns iterable of `re.Match` objects --- .../lib/semmle/python/frameworks/Stdlib.qll | 52 ++++++++++++------- .../frameworks/stdlib/test_re.py | 4 +- 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/python/ql/lib/semmle/python/frameworks/Stdlib.qll b/python/ql/lib/semmle/python/frameworks/Stdlib.qll index 950e741d58eb..2bec246bfc0b 100644 --- a/python/ql/lib/semmle/python/frameworks/Stdlib.qll +++ b/python/ql/lib/semmle/python/frameworks/Stdlib.qll @@ -3284,6 +3284,18 @@ module StdlibPrivate { } } + /** + * A base API node for regular expression functions. + * Either the `re` module or a compiled regular expression. + */ + private API::Node re(boolean compiled) { + result = API::moduleImport("re") and + compiled = false + or + result = any(RePatternSummary c).getACall().(API::CallNode).getReturn() and + compiled = true + } + /** * A flow summary for methods returning a `re.Match` object * @@ -3293,17 +3305,18 @@ module StdlibPrivate { ReMatchSummary() { this = ["re.Match", "compiled re.Match"] } override DataFlow::CallCfgNode getACall() { - this = "re.Match" and - result = API::moduleImport("re").getMember(["match", "search", "fullmatch"]).getACall() - or - this = "compiled re.Match" and - result = - any(RePatternSummary c) - .getACall() - .(API::CallNode) - .getReturn() - .getMember(["match", "search", "fullmatch"]) - .getACall() + exists(API::Node re, boolean compiled | + re = re(compiled) and + ( + compiled = false and + this = "re.Match" + or + compiled = true and + this = "compiled re.Match" + ) + | + result = re.getMember(["match", "search", "fullmatch"]).getACall() + ) } override DataFlow::ArgumentNode getACallback() { none() } @@ -3340,6 +3353,13 @@ module StdlibPrivate { } } + /** An API node for a `re.Match` object */ + private API::Node match() { + result = any(ReMatchSummary c).getACall().(API::CallNode).getReturn() + or + result = re(_).getMember("finditer").getReturn().getASubscript() + } + /** * A flow summary for methods on a `re.Match` object * @@ -3353,15 +3373,7 @@ module StdlibPrivate { methodName in ["expand", "group", "groups", "groupdict"] } - override DataFlow::CallCfgNode getACall() { - result = - any(ReMatchSummary c) - .getACall() - .(API::CallNode) - .getReturn() - .getMember(methodName) - .getACall() - } + override DataFlow::CallCfgNode getACall() { result = match().getMember(methodName).getACall() } override DataFlow::ArgumentNode getACallback() { none() } diff --git a/python/ql/test/library-tests/frameworks/stdlib/test_re.py b/python/ql/test/library-tests/frameworks/stdlib/test_re.py index c2c3c75a979a..4cfe5d972b7b 100644 --- a/python/ql/test/library-tests/frameworks/stdlib/test_re.py +++ b/python/ql/test/library-tests/frameworks/stdlib/test_re.py @@ -42,8 +42,8 @@ list(re.finditer(pat, ts))[0].string, # $ tainted [m.string for m in re.finditer(pat, ts)], # $ tainted - list(re.finditer(pat, ts))[0].groups()[0], # $ MISSING: tainted - [m.groups()[0] for m in re.finditer(pat, ts)], # $ MISSING: tainted + list(re.finditer(pat, ts))[0].groups()[0], # $ MISSING: tainted // this requires list content in type tracking + [m.groups()[0] for m in re.finditer(pat, ts)], # $ tainted ) ensure_not_tainted( safe_match.expand("Hello \1"), From 6bd46148e75dc2a4a1c2a76c2f20542662135613 Mon Sep 17 00:00:00 2001 From: Rasmus Lerchedahl Petersen Date: Wed, 9 Oct 2024 16:27:52 +0200 Subject: [PATCH 4/4] Python: add change note --- python/ql/lib/change-notes/2024-10-09-finditer-match.md | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 python/ql/lib/change-notes/2024-10-09-finditer-match.md diff --git a/python/ql/lib/change-notes/2024-10-09-finditer-match.md b/python/ql/lib/change-notes/2024-10-09-finditer-match.md new file mode 100644 index 000000000000..ee2ccc1119a4 --- /dev/null +++ b/python/ql/lib/change-notes/2024-10-09-finditer-match.md @@ -0,0 +1,4 @@ +--- +category: minorAnalysis +--- +* Modelled that `re.finditer` returns an iterable of `re.Match` objects. This is now understood by the API graph in many cases. \ No newline at end of file