Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 53 additions & 73 deletions adlfs/spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,13 @@ def __init__(
max_concurrency = batch_size
self.max_concurrency = max_concurrency

@property
def account_url(self) -> str:

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Technically, to keep the existing behavior, we will need to check whether the connection string is set and if so return None. We should add that since if a connection_string is provided, an account_name would not be and we could end up encoding None into the account url.

While it is possible to piece together an account url from a connection string, that would be quite a bit of logic for the purpose of this PR.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll update it to return None

if hasattr(self, "account_host"):
return f"https://{self.account_host}"
else:
return f"https://{self.account_name}.blob.core.windows.net"

@classmethod
def _strip_protocol(cls, path: str):
"""
Expand Down Expand Up @@ -464,55 +471,15 @@ def _get_default_azure_credential(self, **kwargs):

def do_connect(self):
"""Connect to the BlobServiceClient, using user-specified connection details.
Tries credentials first, then connection string and finally account key
Tries connection string first, then credential and finally account key

Raises
------
ValueError if none of the connection details are available
"""

try:
if self.connection_string is not None:
self.service_client = AIOBlobServiceClient.from_connection_string(
conn_str=self.connection_string
)
elif self.account_name is not None:
if hasattr(self, "account_host"):
self.account_url: str = f"https://{self.account_host}"
else:
self.account_url: str = (
f"https://{self.account_name}.blob.core.windows.net"
)

creds = [self.credential, self.account_key]
if any(creds):
self.service_client = [
AIOBlobServiceClient(
account_url=self.account_url,
credential=cred,
_location_mode=self.location_mode,
)
for cred in creds
if cred is not None
][0]
elif self.sas_token is not None:
if not self.sas_token.startswith("?"):
self.sas_token = f"?{self.sas_token}"
self.service_client = AIOBlobServiceClient(
account_url=self.account_url + self.sas_token,
credential=None,
_location_mode=self.location_mode,
)
else:
# Fall back to anonymous login, and assume public container
self.service_client = AIOBlobServiceClient(
account_url=self.account_url
)
else:
raise ValueError(
"Must provide either a connection_string or account_name with credentials!!"
)

self.service_client = self._get_service_client()
except RuntimeError:
loop = get_loop()
asyncio.set_event_loop(loop)
Expand All @@ -521,6 +488,46 @@ def do_connect(self):
except Exception as e:
raise ValueError(f"unable to connect to account for {e}") from e

def _get_service_client(self) -> AIOBlobServiceClient:

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we hoist this helper internal method to the module level instead of making it a method on the class? Mainly that helps better avoid accessing internal methods of one class from another class.

@jamesmyatt jamesmyatt Jun 15, 2026

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not really. It uses a lot of AzureBlobFileSystem attributes, so it needs the AzureBlobFileSystem object, and might as well just be a method. So I'd prefer to make it a public method (create_service_client), if it needs to be called from other classes.

The other option is to make a public method that prepares the arguments (e.g. get_client_kwargs), and then calls a module-level helper to actually create the AIOBlobServiceClient.

"""Connect to the Asynchronous BlobServiceClient.

Tries connection string first, then credential and finally account key
"""
# Shortcut for connection string
if self.connection_string is not None:
logger.info("Connect using connection string")
return AIOBlobServiceClient.from_connection_string(
conn_str=self.connection_string
)

if self.account_name is not None:
kwargs = {
"account_url": self.account_url,
"credential": None,
"_location_mode": self.location_mode,
}
creds = ["credential", "sync_credential", "account_key"]

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any ideas on why the file system's client and the file-object's client were out of sync in terms of credential resolution here in the first place? I need to go through the git history on how these became out of sync, but I'd probably lean toward even removing sync_credential from that list so that:

  1. We have complete parity with the do_connect() method. Typically this is the method that most people will use to authenticate since by default even in the AzureBlobFile class, it will just reuse the client from the file system unless connect_client() is explicitly called.
  2. While technically it is possible to provide a sync credential to an async client, it does not seem like a practice that we should be continuing, instead async clients should be using async credentials.

I'll look more into this and circle back on what we do here.

@jamesmyatt jamesmyatt Jun 15, 2026

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like this is the only place that sync_credential is used, so I'd be tempted to remove it entirely. What do you think?

I also think that this method should use exactly the same logic as do_connect. Otherwise it will only cause confusion. To me, it looks like a copy-and-paste that has not been properly updated.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, I'm fairly sure sync_credentials can be removed: #551

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's also the question of why connect_client is creating a new service client, when the associated fs object should already have one.

for name in creds:
if (cred := getattr(self, name, None)) is not None:
logger.info("Connect using %s", name.replace("_", " "))

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For these logging statements, let's set the level to debug. Mainly debug is the prevalent level used in the codebase and these statements are intended for debugging which credentials are used.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK

kwargs["credential"] = cred
break
else:
if self.sas_token is not None:
logger.info("Connect using SAS token")
if not self.sas_token.startswith("?"):
self.sas_token = f"?{self.sas_token}"
Comment on lines +518 to +519

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be interesting if we could hoist this logic to the __init__. Mainly, going over this method now this is the last place where any mutation is happening on the file system. So if we move it, it give the nice property that this method does not mutate state.

@jamesmyatt jamesmyatt Jun 15, 2026

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree this shouldn't be mutating self, but I think this is actually just a hack for the way that the overall URL is constructed (i.e. query part). What is expected to be the value of sas_token passed to the constructor? The azure docs say that the "?" delimiter is not part of the SAS token: https://learn.microsoft.com/en-us/azure/storage/common/storage-sas-overview?toc=/azure/storage/blobs/toc.json&bc=/azure/storage/blobs/breadcrumb/toc.json#sas-token.

The URL can be reconstructed with urllib.parse I think.

kwargs["account_url"] = f'{kwargs["account_url"]}{self.sas_token}'
else:
logger.info("Connect using anonymous login")

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the previous logic, the location_mode was never used in anonymous mode, meaning if a value of secondary was set adlfs would still use the primary endpoint. I'd lean toward retaining that behavior for now since the PR is more about refactoring and logging. But I'm open to respecting location_mode for anonymous connections if it supported for GRS accounts but would need to double check to confirm if it is.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK

# Fall back to anonymous login, and assume public container

return AIOBlobServiceClient(**kwargs)

raise ValueError(
"Must provide either a connection_string or account_name with credentials!!"
)

def split_path(
self, path, delimiter="/", return_container: bool = False, **kwargs
) -> Tuple[str, str, Optional[str]]:
Expand Down Expand Up @@ -2030,43 +2037,16 @@ def close(self):

def connect_client(self):
"""Connect to the Asynchronous BlobServiceClient, using user-specified connection details.
Tries credentials first, then connection string and finally account key
Tries connection string first, then credential and finally account key

Raises
------
ValueError if none of the connection details are available
"""
try:
if hasattr(self.fs, "account_host"):
self.fs.account_url: str = f"https://{self.fs.account_host}"
else:
self.fs.account_url: str = (
f"https://{self.fs.account_name}.blob.core.windows.net"
)

creds = [self.fs.sync_credential, self.fs.account_key, self.fs.credential]
if any(creds):
self.container_client = [
AIOBlobServiceClient(
account_url=self.fs.account_url,
credential=cred,
_location_mode=self.fs.location_mode,
).get_container_client(self.container_name)
for cred in creds
if cred is not None
][0]
elif self.fs.connection_string is not None:
self.container_client = AIOBlobServiceClient.from_connection_string(
conn_str=self.fs.connection_string
).get_container_client(self.container_name)
elif self.fs.sas_token is not None:
self.container_client = AIOBlobServiceClient(
account_url=self.fs.account_url + self.fs.sas_token, credential=None
).get_container_client(self.container_name)
else:
self.container_client = AIOBlobServiceClient(
account_url=self.fs.account_url
).get_container_client(self.container_name)
self.container_client = self.fs._get_service_client().get_container_client(
self.container_name
)

except Exception as e:
raise ValueError(
Expand Down
Loading