Amazon S3¶
Comprehensive guide for using PanPath with Amazon S3.
Prerequisites¶
Installation¶
# Sync support
pip install panpath[s3]
# Async support
pip install panpath[async-s3]
# Both
pip install panpath[s3,async-s3]
AWS Credentials¶
Configure AWS credentials using any of the standard methods supported by the AWS SDK (the usual boto3 credential chain):
- Environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and optionally AWS_SESSION_TOKEN)
- A shared credentials file (~/.aws/credentials), optionally with named profiles
- An IAM role attached to the compute environment (EC2 instance profile, ECS task role, Lambda execution role)
- Explicit keys passed to a custom client (see Configuration below)
Basic Usage¶
URI Format¶
from panpath import PanPath
# Format: s3://bucket-name/key/path
path = PanPath("s3://my-bucket/data/file.txt")
# Bucket root
bucket = PanPath("s3://my-bucket/")
# Nested paths
nested = PanPath("s3://my-bucket/folder/subfolder/file.txt")
Reading and Writing¶
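PanPath paths expose the familiar pathlib-style helpers read_text, write_text, read_bytes, and write_bytes, all of which appear in later examples. A quick sketch:
from panpath import PanPath

path = PanPath("s3://my-bucket/data/file.txt")

# Text I/O
path.write_text("Hello, S3!")
print(path.read_text())  # Hello, S3!

# Binary I/O
path.write_bytes(b"\x00\x01\x02")
data = path.read_bytes()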
Path Operations¶
from panpath import PanPath
path = PanPath("s3://my-bucket/data/file.txt")
# Path components
print(path.name) # file.txt
print(path.stem) # file
print(path.suffix) # .txt
print(path.parent) # s3://my-bucket/data
# Join paths
new_path = path.parent / "other.txt"
print(new_path) # s3://my-bucket/data/other.txt
# Modify components
renamed = path.with_name("newfile.txt")
print(renamed) # s3://my-bucket/data/newfile.txt
different_ext = path.with_suffix(".csv")
print(different_ext) # s3://my-bucket/data/file.csv
Directory Operations¶
Listing Objects¶
from panpath import PanPath
bucket = PanPath("s3://my-bucket/data/")
# List all items
for item in bucket.iterdir():
    print(item)

# Find specific files
for txt_file in bucket.glob("*.txt"):
    print(txt_file)

# Recursive search
for py_file in bucket.rglob("*.py"):
    print(py_file)
Walking Directory Tree¶
from panpath import PanPath
bucket = PanPath("s3://my-bucket/")
for dirpath, dirnames, filenames in bucket.walk():
    print(f"Directory: {dirpath}")
    print(f"  Subdirectories: {dirnames}")
    print(f"  Files: {filenames}")
Creating Directories¶
from panpath import PanPath
# Create directory (creates marker object)
directory = PanPath("s3://my-bucket/new-folder/")
directory.mkdir(parents=True, exist_ok=True)
Async Operations¶
Basic Async Usage¶
import asyncio
from panpath import PanPath
async def main():
    path = PanPath("s3://my-bucket/file.txt")

    # Write
    await path.a_write_text("Async content")

    # Read
    content = await path.a_read_text()
    print(content)

asyncio.run(main())
Async Context Manager¶
import asyncio
from panpath import PanPath
async def main():
    path = PanPath("s3://my-bucket/log.txt")

    # Write
    async with path.a_open("w") as f:
        await f.a_write("Line 1\n")
        await f.a_write("Line 2\n")

    # Read
    async with path.a_open("r") as f:
        content = await f.a_read()
        print(content)

asyncio.run(main())
File Position Control (seek/tell)¶
Async file handles support seek() and tell() methods for controlling file position:
import asyncio
from panpath import PanPath
async def read_partial():
    path = PanPath("s3://my-bucket/large-file.bin")

    async with path.a_open("rb") as f:
        # Get current position
        pos = await f.tell()
        print(f"Position: {pos}")  # 0

        # Read first 100 bytes
        chunk = await f.read(100)

        # Check new position
        pos = await f.tell()
        print(f"Position: {pos}")  # 100

        # Seek to specific position
        await f.seek(50)  # Absolute position

        # Read from new position
        chunk = await f.read(50)

        # Seek relative to current position
        await f.seek(10, 1)  # 10 bytes forward

        # Seek from end
        await f.seek(-100, 2)  # 100 bytes before end

asyncio.run(read_partial())
Use Cases for seek/tell
- Large file processing: Read specific chunks without downloading the entire file
- Resume operations: Track position for resumable downloads
- Random access: Jump to specific offsets in structured files
- Header parsing: Read file headers without loading the full content (see the sketch after this list)
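As an illustration of the header-parsing case, here is a minimal sketch that checks a file's magic bytes using the a_open/read calls shown above; the PNG signature check and the URI are illustrative:
import asyncio
from panpath import PanPath

PNG_SIGNATURE = b"\x89PNG\r\n\x1a\n"

async def is_png(uri: str) -> bool:
    """Check a file's magic bytes without downloading the whole object."""
    path = PanPath(uri)
    async with path.a_open("rb") as f:
        header = await f.read(8)  # fetch only the first 8 bytes
    return header == PNG_SIGNATURE

print(asyncio.run(is_png("s3://my-bucket/image.png")))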
Parallel Operations¶
import asyncio
from panpath import PanPath
async def download_all(uris: list[str]):
    """Download multiple files concurrently."""
    paths = [PanPath(uri) for uri in uris]
    contents = await asyncio.gather(*[p.a_read_text() for p in paths])
    return contents

uris = [
    "s3://my-bucket/file1.txt",
    "s3://my-bucket/file2.txt",
    "s3://my-bucket/file3.txt",
]

asyncio.run(download_all(uris))
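For large batches it can help to cap the number of requests in flight. A sketch using asyncio.Semaphore (the limit of 16 is an arbitrary illustrative value):
import asyncio
from panpath import PanPath

async def download_all_bounded(uris: list[str], limit: int = 16) -> list[str]:
    """Like download_all, but with at most `limit` concurrent reads."""
    sem = asyncio.Semaphore(limit)

    async def read_one(uri: str) -> str:
        async with sem:  # wait for a free slot before issuing the request
            return await PanPath(uri).a_read_text()

    return await asyncio.gather(*[read_one(uri) for uri in uris])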
Advanced Features¶
Server-Side Copy¶
S3 supports server-side copy, which avoids downloading and re-uploading when copying between S3 locations:
from panpath import PanPath
# Fast: No download/upload
src = PanPath("s3://my-bucket/source.txt")
src.copy("s3://my-bucket/backup/source.txt")
# Also works across buckets
src.copy("s3://other-bucket/source.txt")
Multipart Upload¶
Large files are automatically handled with multipart upload:
from panpath import PanPath
# Large file (>5GB)
large_file = PanPath("s3://my-bucket/large-file.bin")
# Automatically uses multipart upload
with open("/local/large-file.bin", "rb") as f:
large_file.write_bytes(f.read())
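Note that write_bytes(f.read()) buffers the entire file in memory first. If that is a concern, a chunked variant using the async handle API shown earlier avoids it (a sketch: it assumes a_open("wb") accepts binary writes the same way the "w" mode shown earlier accepts text, and the 8 MiB chunk size is illustrative):
import asyncio
from panpath import PanPath

async def upload_streaming(local_path: str, s3_uri: str,
                           chunk_size: int = 8 * 1024 * 1024):
    """Stream a local file to S3 chunk by chunk instead of loading it whole."""
    dst = PanPath(s3_uri)
    with open(local_path, "rb") as src:
        async with dst.a_open("wb") as f:
            while chunk := src.read(chunk_size):
                await f.a_write(chunk)

asyncio.run(upload_streaming("/local/large-file.bin", "s3://my-bucket/large-file.bin"))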
Object Metadata¶
from panpath import PanPath
path = PanPath("s3://my-bucket/file.txt")
# Get metadata via stat()
stat = path.stat()
print(f"Size: {stat.st_size} bytes")
print(f"Modified: {stat.st_mtime}")
S3-Specific Properties¶
from panpath import PanPath
path = PanPath("s3://my-bucket/folder/file.txt")
# Cloud prefix (bucket)
print(path.cloud_prefix) # s3://my-bucket
# Key (path within bucket)
print(path.key) # folder/file.txt
# Bucket name
print(path.bucket) # my-bucket
Performance Tips¶
1. Use Async for Multiple Operations¶
# Slow: Sequential
from panpath import PanPath
for i in range(100):
    path = PanPath(f"s3://bucket/file{i}.txt")
    content = path.read_text()
# Fast: Concurrent
from panpath import PanPath
import asyncio
async def read_all():
    paths = [PanPath(f"s3://bucket/file{i}.txt") for i in range(100)]
    contents = await asyncio.gather(*[p.a_read_text() for p in paths])
    return contents

asyncio.run(read_all())
2. Use Server-Side Copy¶
# Slow: Downloads then uploads
src = PanPath("s3://bucket/large.bin")
content = src.read_bytes()
dst = PanPath("s3://bucket/backup/large.bin")
dst.write_bytes(content)
# Fast: Server-side copy
src.copy("s3://bucket/backup/large.bin")
3. Batch Operations¶
from panpath import PanPath
# Copy entire directory efficiently
src_dir = PanPath("s3://bucket/data/")
src_dir.copytree("s3://bucket/backup/")
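copytree copies everything under the prefix. For selective batches you can combine rglob with server-side copy (a sketch; the destination layout built from .key is illustrative):
from panpath import PanPath

src_dir = PanPath("s3://bucket/data/")

# Server-side copy only the CSV files, preserving their keys under a new prefix
for csv_file in src_dir.rglob("*.csv"):
    csv_file.copy(f"s3://bucket/backup/{csv_file.key}")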
Configuration¶
Custom Client¶
from panpath import PanPath
from panpath.clients import get_s3_client
# Configure with custom settings
client = get_s3_client(
    aws_access_key_id="your_key",
    aws_secret_access_key="your_secret",
    region_name="us-west-2",
    endpoint_url="https://custom-s3-endpoint.com",  # For S3-compatible services
)
# Paths will use this configuration
path = PanPath("s3://bucket/file.txt")
S3-Compatible Services¶
PanPath works with S3-compatible services (MinIO, DigitalOcean Spaces, etc.):
from panpath.clients import get_s3_client
# Configure for MinIO
get_s3_client(
    endpoint_url="http://localhost:9000",
    aws_access_key_id="minioadmin",
    aws_secret_access_key="minioadmin",
)
# Use normally
from panpath import PanPath
path = PanPath("s3://my-bucket/file.txt")
Error Handling¶
from panpath import PanPath
from panpath.exceptions import (
    PathNotFoundError,
    PermissionError,
)
import botocore.exceptions
path = PanPath("s3://bucket/file.txt")
try:
    content = path.read_text()
except PathNotFoundError:
    print("File not found")
except PermissionError:
    print("Access denied")
except botocore.exceptions.NoCredentialsError:
    print("AWS credentials not configured")
except botocore.exceptions.ClientError as e:
    error_code = e.response['Error']['Code']
    if error_code == '404':
        print("File not found")
    elif error_code == '403':
        print("Access denied")
Examples¶
Process CSV Files¶
from panpath import PanPath
import csv
from io import StringIO
def process_csv(s3_uri: str):
    path = PanPath(s3_uri)

    # Read CSV
    content = path.read_text()
    reader = csv.DictReader(StringIO(content))

    # Process rows
    for row in reader:
        print(row)
process_csv("s3://my-bucket/data.csv")
Backup to S3¶
from panpath import PanPath
from datetime import datetime
def backup_to_s3(local_dir: str, s3_bucket: str):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    src = PanPath(local_dir)
    dst = PanPath(f"s3://{s3_bucket}/backup_{timestamp}/")

    print(f"Backing up {src} to {dst}...")
    src.copytree(dst)
    print("Backup complete!")
backup_to_s3("/important/data/", "my-backups")
Download Dataset¶
from panpath import PanPath
def download_dataset(s3_uri: str, local_dir: str):
    src = PanPath(s3_uri)
    dst = PanPath(local_dir)

    print(f"Downloading {src}...")
    src.copytree(dst)
    print(f"Downloaded to {dst}")
download_dataset("s3://datasets/imagenet/", "/data/imagenet/")
See Also¶
- Quick Start - Basic usage
- Async Operations - Async patterns
- Bulk Operations - Efficient file operations
- API Reference - Complete S3 API