diff --git a/python/ql/lib/change-notes/2024-10-09-finditer-match.md b/python/ql/lib/change-notes/2024-10-09-finditer-match.md new file mode 100644 index 000000000000..ee2ccc1119a4 --- /dev/null +++ b/python/ql/lib/change-notes/2024-10-09-finditer-match.md @@ -0,0 +1,4 @@ +--- +category: minorAnalysis +--- +* Modelled that `re.finditer` returns an iterable of `re.Match` objects. This is now understood by the API graph in many cases. \ No newline at end of file diff --git a/python/ql/lib/semmle/python/frameworks/Stdlib.qll b/python/ql/lib/semmle/python/frameworks/Stdlib.qll index c7179dbd46c0..2bec246bfc0b 100644 --- a/python/ql/lib/semmle/python/frameworks/Stdlib.qll +++ b/python/ql/lib/semmle/python/frameworks/Stdlib.qll @@ -3284,6 +3284,18 @@ module StdlibPrivate { } } + /** + * Gets a base API node for regular expression functions. + * Either the `re` module (`compiled = false`) or the result of a call modelled by `RePatternSummary`, i.e. a compiled regular expression (`compiled = true`). + */ + private API::Node re(boolean compiled) { + result = API::moduleImport("re") and + compiled = false + or + result = any(RePatternSummary c).getACall().(API::CallNode).getReturn() and + compiled = true + } + /** * A flow summary for methods returning a `re.Match` object * @@ -3293,17 +3305,18 @@ module StdlibPrivate { ReMatchSummary() { this = ["re.Match", "compiled re.Match"] } override DataFlow::CallCfgNode getACall() { - this = "re.Match" and - result = API::moduleImport("re").getMember(["match", "search", "fullmatch"]).getACall() - or - this = "compiled re.Match" and - result = - any(RePatternSummary c) - .getACall() - .(API::CallNode) - .getReturn() - .getMember(["match", "search", "fullmatch"]) - .getACall() + exists(API::Node re, boolean compiled | + re = re(compiled) and + ( + compiled = false and + this = "re.Match" + or + compiled = true and + this = "compiled re.Match" + ) + | + result = re.getMember(["match", "search", "fullmatch"]).getACall() + ) } override DataFlow::ArgumentNode getACallback() { none() } @@ -3340,6 +3353,13 @@ module StdlibPrivate { } } + /** 
Gets an API node for a `re.Match` object: the return value of a call modelled by `ReMatchSummary`, or a subscript of the return value of `finditer` (on the `re` module or a compiled pattern). */ + private API::Node match() { + result = any(ReMatchSummary c).getACall().(API::CallNode).getReturn() + or + result = re(_).getMember("finditer").getReturn().getASubscript() + } + /** * A flow summary for methods on a `re.Match` object * @@ -3353,15 +3373,7 @@ module StdlibPrivate { methodName in ["expand", "group", "groups", "groupdict"] } - override DataFlow::CallCfgNode getACall() { - result = - any(ReMatchSummary c) - .getACall() - .(API::CallNode) - .getReturn() - .getMember(methodName) - .getACall() - } + override DataFlow::CallCfgNode getACall() { result = match().getMember(methodName).getACall() } override DataFlow::ArgumentNode getACallback() { none() } @@ -3463,6 +3475,14 @@ module StdlibPrivate { ) and preservesValue = false ) + or + // flow from input string to the `string` attribute of each match object in the result of `finditer` + exists(int arg | arg = methodName.(RegexExecutionMethod).getStringArgIndex() - offset | + input in ["Argument[" + arg + "]", "Argument[string:]"] and + methodName = "finditer" and + output = "ReturnValue.ListElement.Attribute[string]" and + preservesValue = true + ) ) } } diff --git a/python/ql/test/library-tests/frameworks/stdlib/test_re.py b/python/ql/test/library-tests/frameworks/stdlib/test_re.py index e45a3fe2576d..4cfe5d972b7b 100644 --- a/python/ql/test/library-tests/frameworks/stdlib/test_re.py +++ b/python/ql/test/library-tests/frameworks/stdlib/test_re.py @@ -38,6 +38,12 @@ compiled_pat.match(ts).string, # $ tainted re.compile(ts).match("safe").re.pattern, # $ tainted + + list(re.finditer(pat, ts))[0].string, # $ tainted + [m.string for m in re.finditer(pat, ts)], # $ tainted + + list(re.finditer(pat, ts))[0].groups()[0], # $ MISSING: tainted // this requires list content in type tracking + [m.groups()[0] for m in re.finditer(pat, ts)], # $ tainted ) ensure_not_tainted( safe_match.expand("Hello \1"),