Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion s3fs/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -852,6 +852,9 @@ async def _lsdir(
versions=False,
):
bucket, key, _ = self.split_path(path)
# A caller-supplied prefix is a stem filter, so the listing is partial
# and must not be cached. Capture it before prefix is overwritten below.
partial = bool(prefix)
if not prefix:
prefix = ""
if key:
Expand All @@ -877,7 +880,7 @@ async def _lsdir(
except ClientError as e:
raise translate_boto_error(e)

if delimiter and files and not versions:
if delimiter and files and not versions and not partial:
self.dircache[path] = files
return files
return self.dircache[path]
Expand Down
53 changes: 53 additions & 0 deletions s3fs/tests/test_s3fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3341,3 +3341,56 @@ async def run():
aio_session.AioSession, "_create_client", counting_create_client
):
asyncio.run(run())


def test_find_with_prefix_does_not_poison_dircache(s3):
    """Check that ``ls`` still sees every file after a prefix-filtered ``find``.

    Two objects are created under one directory and the cache is cleared.
    A ``find`` restricted to ``prefix="train-"`` must return only the
    matching object, and a following ``ls`` of the same directory must
    still report both objects.
    """
    root = test_bucket_name + "/splits"
    train_key = root + "/train-00000.parquet"
    test_key = root + "/test-00000.parquet"
    s3.touch(train_key)
    s3.touch(test_key)
    s3.invalidate_cache()

    # The filtered lookup should only match the "train-" object.
    matched = s3.find(root, prefix="train-", maxdepth=1)
    assert len(matched) == 1
    assert train_key in matched

    # The unfiltered listing afterwards must still contain both objects.
    listed_names = {entry.split("/")[-1] for entry in s3.ls(root)}
    assert listed_names == {"train-00000.parquet", "test-00000.parquet"}


def test_glob_prefix_does_not_poison_dircache(s3):
    """Check that two differently-stemmed globs on one directory both match.

    After globbing for ``train-*``, a second glob for ``test-*`` in the same
    directory must still find the ``test-`` object.
    """
    root = test_bucket_name + "/globs"
    for suffix in ("/train-00000.parquet", "/test-00000.parquet"):
        s3.touch(root + suffix)
    s3.invalidate_cache()

    train_hits = s3.glob(root + "/train-*")
    assert len(train_hits) == 1

    # The second pattern uses a different stem in the same directory.
    test_hits = s3.glob(root + "/test-*")
    assert len(test_hits) == 1
    assert root + "/test-00000.parquet" in test_hits


def test_find_with_prefix_preserves_existing_full_cache(s3):
    """Check that a prefix-filtered ``find`` does not shrink a prior listing.

    The directory is listed in full first, then a ``find`` with
    ``prefix="train-"`` is issued. A subsequent ``ls`` must still report
    both objects.
    """
    root = test_bucket_name + "/warm"
    s3.touch(root + "/train-00000.parquet")
    s3.touch(root + "/test-00000.parquet")
    s3.invalidate_cache()

    # Full listing before the filtered lookup.
    initial_listing = s3.ls(root)
    assert len(initial_listing) == 2

    # Result intentionally ignored; only its effect on later listings matters.
    s3.find(root, prefix="train-", maxdepth=1)

    names_after = {entry.split("/")[-1] for entry in s3.ls(root)}
    assert names_after == {"train-00000.parquet", "test-00000.parquet"}


def test_normal_ls_and_find_still_populate_dircache(s3):
    """Check that a plain ``ls`` stores the directory listing in ``dircache``.

    After the cache is cleared the directory must be absent from
    ``s3.dircache``; after an unfiltered ``ls`` it must be present with
    two entries.
    """
    root = test_bucket_name + "/normal"
    for suffix in ("/file-a", "/file-b"):
        s3.touch(root + suffix)
    s3.invalidate_cache()

    assert root not in s3.dircache

    s3.ls(root)
    assert root in s3.dircache
    cached_entries = s3.dircache[root]
    assert len(cached_entries) == 2
Loading