From 24af45172f74e4f01eb21d3aee7beab62417b833 Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Tue, 6 Jun 2023 23:50:36 +0100 Subject: GH-102613: Fast recursive globbing in `pathlib.Path.glob()` (GH-104512) This commit introduces a 'walk-and-match' strategy for handling glob patterns that include a non-terminal `**` wildcard, such as `**/*.py`. For this example, the previous implementation recursively walked directories using `os.scandir()` when it expanded the `**` component, and then **scanned those same directories again** when expanded the `*.py` component. This is wasteful. In the new implementation, any components following a `**` wildcard are used to build a `re.Pattern` object, which is used to filter the results of the recursive walk. A pattern like `**/*.py` uses half the number of `os.scandir()` calls; a pattern like `**/*/*.py` a third, etc. This new algorithm does not apply if either: 1. The *follow_symlinks* argument is set to `None` (its default), or 2. The pattern contains `..` components. In these cases we fall back to the old implementation. This commit also replaces selector classes with selector functions. These generators directly yield results rather calling through to their successors. A new internal `Path._glob()` method takes care to chain these generators together, which simplifies the lazy algorithm and slightly improves performance. It should also be easier to understand and maintain. --- Lib/test/test_pathlib.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'Lib/test/test_pathlib.py') diff --git a/Lib/test/test_pathlib.py b/Lib/test/test_pathlib.py index da5197dbfac..1a008e5cea3 100644 --- a/Lib/test/test_pathlib.py +++ b/Lib/test/test_pathlib.py @@ -1898,6 +1898,16 @@ class _BasePathTest(object): _check(p, "*B/*", ["dirB/fileB", "dirB/linkD", "linkB/fileB", "linkB/linkD"]) _check(p, "*/fileB", ["dirB/fileB", "linkB/fileB"]) _check(p, "*/", ["dirA", "dirB", "dirC", "dirE", "linkB"]) + _check(p, "dir*/*/..", ["dirC/dirD/..", "dirA/linkC/.."]) + _check(p, "dir*/**/", ["dirA", "dirA/linkC", "dirA/linkC/linkD", "dirB", "dirB/linkD", + "dirC", "dirC/dirD", "dirE"]) + _check(p, "dir*/**/..", ["dirA/..", "dirA/linkC/..", "dirB/..", + "dirC/..", "dirC/dirD/..", "dirE/.."]) + _check(p, "dir*/*/**/", ["dirA/linkC", "dirA/linkC/linkD", "dirB/linkD", "dirC/dirD"]) + _check(p, "dir*/*/**/..", ["dirA/linkC/..", "dirC/dirD/.."]) + _check(p, "dir*/**/fileC", ["dirC/fileC"]) + _check(p, "dir*/*/../dirD/**/", ["dirC/dirD/../dirD"]) + _check(p, "*/dirD/**/", ["dirC/dirD"]) @os_helper.skip_unless_symlink def test_glob_no_follow_symlinks_common(self): @@ -1912,6 +1922,14 @@ class _BasePathTest(object): _check(p, "*B/*", ["dirB/fileB", "dirB/linkD"]) _check(p, "*/fileB", ["dirB/fileB"]) _check(p, "*/", ["dirA", "dirB", "dirC", "dirE"]) + _check(p, "dir*/*/..", ["dirC/dirD/.."]) + _check(p, "dir*/**/", ["dirA", "dirB", "dirC", "dirC/dirD", "dirE"]) + _check(p, "dir*/**/..", ["dirA/..", "dirB/..", "dirC/..", "dirC/dirD/..", "dirE/.."]) + _check(p, "dir*/*/**/", ["dirC/dirD"]) + _check(p, "dir*/*/**/..", ["dirC/dirD/.."]) + _check(p, "dir*/**/fileC", ["dirC/fileC"]) + _check(p, "dir*/*/../dirD/**/", ["dirC/dirD/../dirD"]) + _check(p, "*/dirD/**/", ["dirC/dirD"]) def test_rglob_common(self): def _check(glob, expected): -- cgit v1.2.3