streaming ingestion support for PUT operation #643

Status: Open · wants to merge 5 commits into main
34 changes: 34 additions & 0 deletions examples/streaming_put.py
@@ -0,0 +1,34 @@
#!/usr/bin/env python3
"""
Simple example of streaming PUT operations.

This demonstrates the basic usage of streaming PUT with the __input_stream__ token.
"""

import io
import os

from databricks import sql

with sql.connect(
    server_hostname=os.getenv("DATABRICKS_SERVER_HOSTNAME"),
    http_path=os.getenv("DATABRICKS_HTTP_PATH"),
    access_token=os.getenv("DATABRICKS_TOKEN"),
) as connection:
    with connection.cursor() as cursor:
        # Create a simple data stream
        data = b"Hello, streaming world!"
        stream = io.BytesIO(data)

        # Get catalog, schema, and volume from environment variables
        catalog = os.getenv("DATABRICKS_CATALOG")
        schema = os.getenv("DATABRICKS_SCHEMA")
        volume = os.getenv("DATABRICKS_VOLUME")

        # Upload to Unity Catalog volume
        cursor.execute(
            f"PUT '__input_stream__' INTO '/Volumes/{catalog}/{schema}/{volume}/hello.txt' OVERWRITE",
            input_stream=stream,
        )

        print("File uploaded successfully!")
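The example above streams a short literal byte string. Any seekable binary file-like object can be passed the same way; the sketch below is not part of this PR's diff and assumes the same environment variables as the example. It builds a small CSV in memory and uploads it through the same __input_stream__ token.

#!/usr/bin/env python3
# Hypothetical companion sketch (not in this PR's diff): stream an in-memory CSV.

import csv
import io
import os

from databricks import sql

# Build CSV content in a text buffer, then encode it to the binary
# stream that PUT expects.
buffer = io.StringIO()
writer = csv.writer(buffer)
writer.writerow(["id", "name"])
writer.writerow([1, "example"])
stream = io.BytesIO(buffer.getvalue().encode("utf-8"))

catalog = os.getenv("DATABRICKS_CATALOG")
schema = os.getenv("DATABRICKS_SCHEMA")
volume = os.getenv("DATABRICKS_VOLUME")

with sql.connect(
    server_hostname=os.getenv("DATABRICKS_SERVER_HOSTNAME"),
    http_path=os.getenv("DATABRICKS_HTTP_PATH"),
    access_token=os.getenv("DATABRICKS_TOKEN"),
) as connection:
    with connection.cursor() as cursor:
        cursor.execute(
            f"PUT '__input_stream__' INTO '/Volumes/{catalog}/{schema}/{volume}/data.csv' OVERWRITE",
            input_stream=stream,
        )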
132 changes: 96 additions & 36 deletions src/databricks/sql/client.py
@@ -1,5 +1,5 @@
import time
from typing import Dict, Tuple, List, Optional, Any, Union, Sequence
from typing import Dict, Tuple, List, Optional, Any, Union, Sequence, BinaryIO
import pandas

try:
@@ -67,6 +67,7 @@
)
from databricks.sql.telemetry.latency_logger import log_latency
from databricks.sql.telemetry.models.enums import StatementType
from databricks.sql.common.http import DatabricksHttpClient, HttpMethod

logger = logging.getLogger(__name__)

@@ -615,8 +616,34 @@ def _check_not_closed(self):
                session_id_hex=self.connection.get_session_id_hex(),
            )

    def _validate_staging_http_response(
        self, response: requests.Response, operation_name: str = "staging operation"
    ) -> None:
        """Raise an OperationalError unless the staging response indicates success."""

        # Status codes treated as success
        OK = requests.codes.ok  # 200
        CREATED = requests.codes.created  # 201
        ACCEPTED = requests.codes.accepted  # 202
        NO_CONTENT = requests.codes.no_content  # 204

        if response.status_code not in [OK, CREATED, NO_CONTENT, ACCEPTED]:
            raise OperationalError(
                f"{operation_name} over HTTP was unsuccessful: {response.status_code}-{response.text}",
                session_id_hex=self.connection.get_session_id_hex(),
            )

        if response.status_code == ACCEPTED:
            logger.debug(
                "Response code %s from server indicates %s was accepted "
                "but not yet applied on the server. It's possible this command may fail later.",
                ACCEPTED,
                operation_name,
            )

    def _handle_staging_operation(
        self, staging_allowed_local_path: Union[None, str, List[str]]
        self,
        staging_allowed_local_path: Union[None, str, List[str]],
        input_stream: Optional[BinaryIO] = None,
    ):
        """Fetch the HTTP request instruction from a staging ingestion command
        and call the designated handler.
@@ -625,6 +652,28 @@ def _handle_staging_operation(
        is not descended from staging_allowed_local_path.
        """

        assert self.active_result_set is not None
        row = self.active_result_set.fetchone()
        assert row is not None

        # Parse headers
        headers = (
            json.loads(row.headers) if isinstance(row.headers, str) else row.headers
        )
        headers = dict(headers) if headers else {}

        # Handle __input_stream__ token for PUT operations
        if (
            row.operation == "PUT"
            and getattr(row, "localFile", None) == "__input_stream__"
        ):
            return self._handle_staging_put_stream(
                presigned_url=row.presignedUrl,
                stream=input_stream,
                headers=headers,
            )

        # For non-streaming operations, validate staging_allowed_local_path
        if isinstance(staging_allowed_local_path, type(str())):
            _staging_allowed_local_paths = [staging_allowed_local_path]
        elif isinstance(staging_allowed_local_path, type(list())):
@@ -639,10 +688,6 @@
            os.path.abspath(i) for i in _staging_allowed_local_paths
        ]

        assert self.active_result_set is not None
        row = self.active_result_set.fetchone()
        assert row is not None

        # Must set to None in cases where server response does not include localFile
        abs_localFile = None

@@ -665,19 +710,16 @@
                session_id_hex=self.connection.get_session_id_hex(),
            )

        # May be real headers, or could be json string
        headers = (
            json.loads(row.headers) if isinstance(row.headers, str) else row.headers
        )

        handler_args = {
            "presigned_url": row.presignedUrl,
            "local_file": abs_localFile,
            "headers": dict(headers) or {},
            "headers": headers,
        }

        logger.debug(
            f"Attempting staging operation indicated by server: {row.operation} - {getattr(row, 'localFile', '')}"
            "Attempting staging operation indicated by server: %s - %s",
            row.operation,
            getattr(row, "localFile", ""),
        )

# TODO: Create a retry loop here to re-attempt if the request times out or fails
@@ -696,6 +738,43 @@
                session_id_hex=self.connection.get_session_id_hex(),
            )

    @log_latency(StatementType.SQL)
    def _handle_staging_put_stream(
        self,
        presigned_url: str,
        stream: Optional[BinaryIO],
        headers: Optional[dict] = None,
    ) -> None:
        """Handle a PUT operation with streaming data.

        Args:
            presigned_url: The presigned URL for the upload
            stream: Binary stream to upload
            headers: HTTP headers to send with the request

        Raises:
            ProgrammingError: If no input stream is provided
            OperationalError: If the upload fails
        """

        if not stream:
            raise ProgrammingError(
                "No input stream provided for streaming operation",
                session_id_hex=self.connection.get_session_id_hex(),
            )

        http_client = DatabricksHttpClient.get_instance()

        # Stream directly to the presigned URL
        with http_client.execute(
            method=HttpMethod.PUT,
            url=presigned_url,
            data=stream,
            headers=headers or {},
            timeout=300,  # 5 minute timeout
        ) as response:
            self._validate_staging_http_response(response, "stream upload")

    @log_latency(StatementType.SQL)
    def _handle_staging_put(
        self, presigned_url: str, local_file: str, headers: Optional[dict] = None
@@ -714,27 +793,7 @@ def _handle_staging_put(
        with open(local_file, "rb") as fh:
            r = requests.put(url=presigned_url, data=fh, headers=headers)

        # fmt: off
        # Design borrowed from: https://stackoverflow.com/a/2342589/5093960

        OK = requests.codes.ok  # 200
        CREATED = requests.codes.created  # 201
        ACCEPTED = requests.codes.accepted  # 202
        NO_CONTENT = requests.codes.no_content  # 204

        # fmt: on

        if r.status_code not in [OK, CREATED, NO_CONTENT, ACCEPTED]:
            raise OperationalError(
                f"Staging operation over HTTP was unsuccessful: {r.status_code}-{r.text}",
                session_id_hex=self.connection.get_session_id_hex(),
            )

        if r.status_code == ACCEPTED:
            logger.debug(
                f"Response code {ACCEPTED} from server indicates ingestion command was accepted "
                + "but not yet applied on the server. It's possible this command may fail later."
            )
        self._validate_staging_http_response(r, "file upload")

    @log_latency(StatementType.SQL)
    def _handle_staging_get(
@@ -784,6 +843,7 @@ def execute(
        operation: str,
        parameters: Optional[TParameterCollection] = None,
        enforce_embedded_schema_correctness=False,
        input_stream: Optional[BinaryIO] = None,
    ) -> "Cursor":
        """
        Execute a query and wait for execution to complete.
@@ -820,7 +880,6 @@
        logger.debug(
            "Cursor.execute(operation=%s, parameters=%s)", operation, parameters
        )

        param_approach = self._determine_parameter_approach(parameters)
        if param_approach == ParameterApproach.NONE:
            prepared_params = NO_NATIVE_PARAMS
@@ -857,7 +916,8 @@

        if self.active_result_set and self.active_result_set.is_staging_operation:
            self._handle_staging_operation(
                staging_allowed_local_path=self.connection.staging_allowed_local_path
                staging_allowed_local_path=self.connection.staging_allowed_local_path,
                input_stream=input_stream,
            )

        return self
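The staging handler above carries a TODO about retrying requests that time out or fail. A minimal sketch of what that retry loop might look like follows; it is not part of this PR, the helper name, attempt count, and backoff values are illustrative, and it assumes a seekable stream so the request body can be replayed between attempts.

import time
from typing import BinaryIO, Optional

import requests


def put_with_retries(
    presigned_url: str,
    stream: BinaryIO,
    headers: Optional[dict] = None,
    attempts: int = 3,
    backoff_s: float = 2.0,
) -> requests.Response:
    """Hypothetical sketch: retry a staging PUT on timeouts and retryable statuses."""
    retryable = {429, 500, 502, 503, 504}
    last_exc: Optional[Exception] = None
    response = None
    for attempt in range(attempts):
        try:
            stream.seek(0)  # replay the body from the start on each attempt
            response = requests.put(
                url=presigned_url, data=stream, headers=headers or {}, timeout=300
            )
            if response.status_code not in retryable:
                return response
        except (requests.ConnectionError, requests.Timeout) as exc:
            last_exc = exc
        time.sleep(backoff_s * (2**attempt))  # exponential backoff
    if response is not None:
        return response  # last response was retryable; let the caller validate it
    raise last_exc  # every attempt raised; surface the final error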
65 changes: 65 additions & 0 deletions tests/e2e/common/streaming_put_tests.py
@@ -0,0 +1,65 @@
#!/usr/bin/env python3
"""
E2E tests for streaming PUT operations.
"""

import io
import logging
import pytest
from datetime import datetime

logger = logging.getLogger(__name__)


class PySQLStreamingPutTestSuiteMixin:
    """Test suite for streaming PUT operations."""

    def test_streaming_put_basic(self, catalog, schema):
        """Test basic streaming PUT functionality."""

        # Create test data
        test_data = b"Hello, streaming world! This is test data."
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"stream_test_{timestamp}.txt"
        file_path = f"/Volumes/{catalog}/{schema}/e2etests/{filename}"

        try:
            with self.connection() as conn:
                with conn.cursor() as cursor:
                    with io.BytesIO(test_data) as stream:
                        cursor.execute(
                            f"PUT '__input_stream__' INTO '{file_path}'",
                            input_stream=stream,
                        )

                    # Verify file exists
                    cursor.execute(f"LIST '/Volumes/{catalog}/{schema}/e2etests/'")
                    files = cursor.fetchall()

                    # Check if our file is in the list
                    file_paths = [row[0] for row in files]
                    assert file_path in file_paths, f"File {file_path} not found in {file_paths}"
        finally:
            self._cleanup_test_file(file_path)

    def test_streaming_put_missing_stream(self, catalog, schema):
        """Test that a missing stream raises an appropriate error."""

        with self.connection() as conn:
            with conn.cursor() as cursor:
                # Test without providing a stream
                with pytest.raises(Exception):  # Should fail
                    cursor.execute(
                        f"PUT '__input_stream__' INTO '/Volumes/{catalog}/{schema}/e2etests/test.txt'"
                        # Note: no input_stream parameter
                    )

    def _cleanup_test_file(self, file_path):
        """Clean up a test file if it exists."""
        try:
            with self.connection(
                extra_params={"staging_allowed_local_path": "/"}
            ) as conn:
                with conn.cursor() as cursor:
                    cursor.execute(f"REMOVE '{file_path}'")
            logger.info("Successfully cleaned up test file: %s", file_path)
        except Exception as e:
            logger.error("Cleanup failed for %s: %s", file_path, e)
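The suite covers the basic upload path and the missing-stream error. A hypothetical companion test, not included in the diff, could exercise a larger payload to confirm the stream is passed through intact; the 5 MiB size and filename below are arbitrary.

    def test_streaming_put_large_payload(self, catalog, schema):
        """Hypothetical test: stream a multi-megabyte payload end to end."""
        test_data = b"x" * (5 * 1024 * 1024)  # 5 MiB of filler
        file_path = f"/Volumes/{catalog}/{schema}/e2etests/large_stream_test.bin"
        try:
            with self.connection() as conn:
                with conn.cursor() as cursor:
                    with io.BytesIO(test_data) as stream:
                        cursor.execute(
                            f"PUT '__input_stream__' INTO '{file_path}' OVERWRITE",
                            input_stream=stream,
                        )
        finally:
            self._cleanup_test_file(file_path)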
3 changes: 2 additions & 1 deletion tests/e2e/test_driver.py
@@ -47,8 +47,8 @@
)
from tests.e2e.common.staging_ingestion_tests import PySQLStagingIngestionTestSuiteMixin
from tests.e2e.common.retry_test_mixins import PySQLRetryTestsMixin

from tests.e2e.common.uc_volume_tests import PySQLUCVolumeTestSuiteMixin
from tests.e2e.common.streaming_put_tests import PySQLStreamingPutTestSuiteMixin

from databricks.sql.exc import SessionAlreadyClosedError

@@ -256,6 +256,7 @@ class TestPySQLCoreSuite(
    PySQLStagingIngestionTestSuiteMixin,
    PySQLRetryTestsMixin,
    PySQLUCVolumeTestSuiteMixin,
    PySQLStreamingPutTestSuiteMixin,
):
    validate_row_value_type = True
    validate_result = True