Amazon S3¶
Comprehensive guide for using PanPath with Amazon S3.
Prerequisites¶
Installation¶
# Sync support
pip install panpath[s3]
# Async support
pip install panpath[async-s3]
# Both
pip install panpath[s3,async-s3]
AWS Credentials¶
Configure AWS credentials using any of the standard methods supported by the AWS SDK (the usual boto3 credential chain):
- Environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and optionally AWS_SESSION_TOKEN)
- A shared credentials file (~/.aws/credentials), optionally with named profiles
- An IAM role attached to the compute environment (EC2 instance profile, ECS task role, Lambda execution role)
- Explicit keys passed to a custom client (see Configuration below)
Basic Usage¶
URI Format¶
from panpath import PanPath
# Format: s3://bucket-name/key/path
path = PanPath("s3://my-bucket/data/file.txt")
# Bucket root
bucket = PanPath("s3://my-bucket/")
# Nested paths
nested = PanPath("s3://my-bucket/folder/subfolder/file.txt")
Reading and Writing¶
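PanPath paths expose the familiar pathlib-style helpers read_text, write_text, read_bytes, and write_bytes, all of which appear in later examples. A quick sketch:
from panpath import PanPath

path = PanPath("s3://my-bucket/data/file.txt")

# Text I/O
path.write_text("Hello, S3!")
print(path.read_text())  # Hello, S3!

# Binary I/O
path.write_bytes(b"\x00\x01\x02")
data = path.read_bytes()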
Path Operations¶
from panpath import PanPath
path = PanPath("s3://my-bucket/data/file.txt")
# Path components
print(path.name) # file.txt
print(path.stem) # file
print(path.suffix) # .txt
print(path.parent) # s3://my-bucket/data
# Join paths
new_path = path.parent / "other.txt"
print(new_path) # s3://my-bucket/data/other.txt
# Modify components
renamed = path.with_name("newfile.txt")
print(renamed) # s3://my-bucket/data/newfile.txt
different_ext = path.with_suffix(".csv")
print(different_ext) # s3://my-bucket/data/file.csv
Directory Operations¶
Listing Objects¶
from panpath import PanPath
bucket = PanPath("s3://my-bucket/data/")
# List all items
for item in bucket.iterdir():
    print(item)

# Find specific files
for txt_file in bucket.glob("*.txt"):
    print(txt_file)

# Recursive search
for py_file in bucket.rglob("*.py"):
    print(py_file)
Walking Directory Tree¶
from panpath import PanPath
bucket = PanPath("s3://my-bucket/")
for dirpath, dirnames, filenames in bucket.walk():
    print(f"Directory: {dirpath}")
    print(f"  Subdirectories: {dirnames}")
    print(f"  Files: {filenames}")
Creating Directories¶
from panpath import PanPath
# Create directory (creates marker object)
directory = PanPath("s3://my-bucket/new-folder/")
directory.mkdir(parents=True, exist_ok=True)
Async Operations¶
Basic Async Usage¶
import asyncio
from panpath import PanPath
async def main():
    path = PanPath("s3://my-bucket/file.txt")

    # Write
    await path.a_write_text("Async content")

    # Read
    content = await path.a_read_text()
    print(content)

asyncio.run(main())
Async Context Manager¶
import asyncio
from panpath import PanPath
async def main():
    path = PanPath("s3://my-bucket/log.txt")

    # Write
    async with path.a_open("w") as f:
        await f.a_write("Line 1\n")
        await f.a_write("Line 2\n")

    # Read
    async with path.a_open("r") as f:
        content = await f.a_read()
        print(content)

asyncio.run(main())
File Position Control (seek/tell)¶
Async file handles support seek() and tell() methods for controlling file position:
import asyncio
from panpath import PanPath
async def read_partial():
    path = PanPath("s3://my-bucket/large-file.bin")

    async with path.a_open("rb") as f:
        # Get current position
        pos = await f.tell()
        print(f"Position: {pos}")  # 0

        # Read first 100 bytes
        chunk = await f.read(100)

        # Check new position
        pos = await f.tell()
        print(f"Position: {pos}")  # 100

        # Seek to specific position
        await f.seek(50)  # Absolute position

        # Read from new position
        chunk = await f.read(50)

        # Seek relative to current position
        await f.seek(10, 1)  # 10 bytes forward

        # Seek from end
        await f.seek(-100, 2)  # 100 bytes before end

asyncio.run(read_partial())
Use Cases for seek/tell
- Large file processing: Read specific chunks without downloading the entire file
- Resume operations: Track position for resumable downloads
- Random access: Jump to specific offsets in structured files
- Header parsing: Read file headers without loading the full content (see the sketch after this list)
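As an illustration of the header-parsing case, here is a minimal sketch that checks a file's magic bytes using the a_open/read calls shown above; the PNG signature check and the URI are illustrative:
import asyncio
from panpath import PanPath

PNG_SIGNATURE = b"\x89PNG\r\n\x1a\n"

async def is_png(uri: str) -> bool:
    """Check a file's magic bytes without downloading the whole object."""
    path = PanPath(uri)
    async with path.a_open("rb") as f:
        header = await f.read(8)  # fetch only the first 8 bytes
    return header == PNG_SIGNATURE

print(asyncio.run(is_png("s3://my-bucket/image.png")))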
Parallel Operations¶
import asyncio
from panpath import PanPath
async def download_all(uris: list[str]):
    """Download multiple files concurrently."""
    paths = [PanPath(uri) for uri in uris]
    contents = await asyncio.gather(*[p.a_read_text() for p in paths])
    return contents

uris = [
    "s3://my-bucket/file1.txt",
    "s3://my-bucket/file2.txt",
    "s3://my-bucket/file3.txt",
]

asyncio.run(download_all(uris))
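For large batches it can help to cap the number of requests in flight. A sketch using asyncio.Semaphore (the limit of 16 is an arbitrary illustrative value):
import asyncio
from panpath import PanPath

async def download_all_bounded(uris: list[str], limit: int = 16) -> list[str]:
    """Like download_all, but with at most `limit` concurrent reads."""
    sem = asyncio.Semaphore(limit)

    async def read_one(uri: str) -> str:
        async with sem:  # wait for a free slot before issuing the request
            return await PanPath(uri).a_read_text()

    return await asyncio.gather(*[read_one(uri) for uri in uris])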
Advanced Features¶
Server-Side Copy¶
S3 supports server-side copy, which avoids downloading and re-uploading when copying between S3 locations:
from panpath import PanPath
# Fast: No download/upload
src = PanPath("s3://my-bucket/source.txt")
src.copy("s3://my-bucket/backup/source.txt")
# Also works across buckets
src.copy("s3://other-bucket/source.txt")
Multipart Upload¶
Large files are automatically handled with multipart upload:
from panpath import PanPath
# Large file (>5GB)
large_file = PanPath("s3://my-bucket/large-file.bin")
# Automatically uses multipart upload
with open("/local/large-file.bin", "rb") as f:
large_file.write_bytes(f.read())
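Note that write_bytes(f.read()) buffers the entire file in memory first. If that is a concern, a chunked variant using the async handle API shown earlier avoids it (a sketch: it assumes a_open("wb") accepts binary writes the same way the "w" mode shown earlier accepts text, and the 8 MiB chunk size is illustrative):
import asyncio
from panpath import PanPath

async def upload_streaming(local_path: str, s3_uri: str,
                           chunk_size: int = 8 * 1024 * 1024):
    """Stream a local file to S3 chunk by chunk instead of loading it whole."""
    dst = PanPath(s3_uri)
    with open(local_path, "rb") as src:
        async with dst.a_open("wb") as f:
            while chunk := src.read(chunk_size):
                await f.a_write(chunk)

asyncio.run(upload_streaming("/local/large-file.bin", "s3://my-bucket/large-file.bin"))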
Object Metadata¶
from panpath import PanPath
path = PanPath("s3://my-bucket/file.txt")
# Get metadata via stat()
stat = path.stat()
print(f"Size: {stat.st_size} bytes")
print(f"Modified: {stat.st_mtime}")
S3-Specific Properties¶
from panpath import PanPath
path = PanPath("s3://my-bucket/folder/file.txt")
# Cloud prefix (bucket)
print(path.cloud_prefix) # s3://my-bucket
# Key (path within bucket)
print(path.key) # folder/file.txt
# Bucket name
print(path.bucket) # my-bucket
Performance Tips¶
1. Use Async for Multiple Operations¶
# Slow: Sequential
from panpath import PanPath
for i in range(100):
    path = PanPath(f"s3://bucket/file{i}.txt")
    content = path.read_text()
# Fast: Concurrent
from panpath import PanPath
import asyncio
async def read_all():
    paths = [PanPath(f"s3://bucket/file{i}.txt") for i in range(100)]
    contents = await asyncio.gather(*[p.a_read_text() for p in paths])
    return contents

asyncio.run(read_all())
2. Use Server-Side Copy¶
# Slow: Downloads then uploads
src = PanPath("s3://bucket/large.bin")
content = src.read_bytes()
dst = PanPath("s3://bucket/backup/large.bin")
dst.write_bytes(content)
# Fast: Server-side copy
src.copy("s3://bucket/backup/large.bin")
3. Batch Operations¶
from panpath import PanPath
# Copy entire directory efficiently
src_dir = PanPath("s3://bucket/data/")
src_dir.copytree("s3://bucket/backup/")
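copytree copies everything under the prefix. For selective batches you can combine rglob with server-side copy (a sketch; the destination layout built from .key is illustrative):
from panpath import PanPath

src_dir = PanPath("s3://bucket/data/")

# Server-side copy only the CSV files, preserving their keys under a new prefix
for csv_file in src_dir.rglob("*.csv"):
    csv_file.copy(f"s3://bucket/backup/{csv_file.key}")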
Configuration¶
Custom Client¶
from panpath import PanPath
from panpath.clients import get_s3_client
# Configure with custom settings
client = get_s3_client(
    aws_access_key_id="your_key",
    aws_secret_access_key="your_secret",
    region_name="us-west-2",
    endpoint_url="https://custom-s3-endpoint.com",  # For S3-compatible services
)
# Paths will use this configuration
path = PanPath("s3://bucket/file.txt")
S3-Compatible Services¶
PanPath works with S3-compatible services (MinIO, DigitalOcean Spaces, etc.):
from panpath.clients import get_s3_client
# Configure for MinIO
get_s3_client(
    endpoint_url="http://localhost:9000",
    aws_access_key_id="minioadmin",
    aws_secret_access_key="minioadmin",
)
# Use normally
from panpath import PanPath
path = PanPath("s3://my-bucket/file.txt")
Error Handling¶
from panpath import PanPath
from panpath.exceptions import (
    PathNotFoundError,
    PermissionError,
)
import botocore.exceptions
path = PanPath("s3://bucket/file.txt")
try:
    content = path.read_text()
except PathNotFoundError:
    print("File not found")
except PermissionError:
    print("Access denied")
except botocore.exceptions.NoCredentialsError:
    print("AWS credentials not configured")
except botocore.exceptions.ClientError as e:
    error_code = e.response['Error']['Code']
    if error_code == '404':
        print("File not found")
    elif error_code == '403':
        print("Access denied")
Examples¶
Process CSV Files¶
from panpath import PanPath
import csv
from io import StringIO
def process_csv(s3_uri: str):
    path = PanPath(s3_uri)

    # Read CSV
    content = path.read_text()
    reader = csv.DictReader(StringIO(content))

    # Process rows
    for row in reader:
        print(row)
process_csv("s3://my-bucket/data.csv")
Backup to S3¶
from panpath import PanPath
from datetime import datetime
def backup_to_s3(local_dir: str, s3_bucket: str):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    src = PanPath(local_dir)
    dst = PanPath(f"s3://{s3_bucket}/backup_{timestamp}/")

    print(f"Backing up {src} to {dst}...")
    src.copytree(dst)
    print("Backup complete!")
backup_to_s3("/important/data/", "my-backups")
Download Dataset¶
from panpath import PanPath
def download_dataset(s3_uri: str, local_dir: str):
    src = PanPath(s3_uri)
    dst = PanPath(local_dir)

    print(f"Downloading {src}...")
    src.copytree(dst)
    print(f"Downloaded to {dst}")
download_dataset("s3://datasets/imagenet/", "/data/imagenet/")
See Also¶
- Quick Start - Basic usage
- Async Operations - Async patterns
- Bulk Operations - Efficient file operations
- API Reference - Complete S3 API