Advanced Memory Patterns and Performance · JAVA

12.3 Advanced Memory Patterns and Performance

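Optimize memory-intensive operations with advanced patterns for zero-copy I/O, vectorization-friendly memory layouts, and efficient off-heap data structures. The examples use the Foreign Function & Memory API (`java.lang.foreign`), which is a preview API in Java 21 and a final API from Java 22 onward.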

Zero-Copy Patterns

Direct Buffer Integration:

import java.lang.foreign.*;
import java.nio.ByteBuffer;

/**
 * Zero-copy interop between NIO {@link ByteBuffer}s and {@link MemorySegment}s: both
 * views address the same bytes, so nothing is copied when switching between them.
 */
public class ZeroCopyPatterns {
    /**
     * Fills a direct {@link ByteBuffer} with an int and a long, wraps it as a
     * {@link MemorySegment} and prints both values as read through the segment.
     */
    public void directBufferToSegment() {
        // A ByteBuffer is big-endian by default, whereas ValueLayout.JAVA_INT/JAVA_LONG
        // use the platform's native order. Both views must agree on the byte order,
        // otherwise the segment reads byte-swapped values on little-endian hardware.
        ByteBuffer buffer = ByteBuffer.allocateDirect(1024)
                .order(java.nio.ByteOrder.nativeOrder());

        // Fill buffer
        buffer.putInt(42);
        buffer.putLong(123456789L);
        buffer.flip();

        // Wrap as MemorySegment (zero-copy); the segment spans position..limit
        MemorySegment segment = MemorySegment.ofBuffer(buffer);

        // Read from segment (same underlying memory)
        int value = segment.get(ValueLayout.JAVA_INT, 0);
        // The long starts at byte 4, which is not 8-byte aligned, so the aligned
        // JAVA_LONG layout would be rejected; use the unaligned variant instead.
        long longValue = segment.get(ValueLayout.JAVA_LONG_UNALIGNED, 4);

        System.out.println("Int: " + value + ", Long: " + longValue);
    }

    /**
     * Performs a single read from {@code socket} directly into native memory and
     * passes the received bytes to {@link #processData} without copying them.
     *
     * @param socket channel to read from
     * @throws Exception if reading from the channel fails
     */
    public void networkReceiveZeroCopy(java.nio.channels.SocketChannel socket)
            throws Exception {
        try (Arena arena = Arena.ofConfined()) {
            // Allocate buffer
            MemorySegment buffer = arena.allocate(8192);

            // Get ByteBuffer view (zero-copy)
            ByteBuffer byteBuffer = buffer.asByteBuffer();

            // Receive directly into native memory
            int bytesRead = socket.read(byteBuffer);

            // A read may deliver fewer bytes than the 4-byte message type (or -1 at
            // end of stream). Only process when the header is complete; a real
            // protocol would keep reading until it is.
            if (bytesRead >= ValueLayout.JAVA_INT.byteSize()) {
                processData(buffer.asSlice(0, bytesRead));
            }
        }
    }

    /**
     * Prints the message type stored in the first four bytes of {@code data}.
     *
     * @param data received bytes; must contain at least four bytes
     */
    private void processData(MemorySegment data) {
        // Process without additional copies
        int messageType = data.get(ValueLayout.JAVA_INT, 0);
        System.out.println("Message type: " + messageType);
    }
}

Memory-Mapped File Zero-Copy:

import java.nio.channels.FileChannel;
import java.nio.file.*;

/**
 * Walks a memory-mapped file in fixed-size slices so the file contents are never
 * copied into a heap array.
 */
public class MappedFileZeroCopy {
    /** Size of one processing slice: 1 MB. */
    private static final long CHUNK_BYTES = 1024 * 1024;

    /**
     * Maps {@code path} read-only and feeds it to {@link #processChunk} slice by slice.
     *
     * @param path file to process
     * @throws Exception if the file cannot be opened or mapped
     */
    public void processLargeFile(Path path) throws Exception {
        try (FileChannel channel = FileChannel.open(path, StandardOpenOption.READ);
             Arena arena = Arena.ofShared()) {

            final long fileSize = channel.size();

            // One mapping for the whole file; slices below are views into it
            final MemorySegment mapped =
                channel.map(FileChannel.MapMode.READ_ONLY, 0, fileSize, arena);

            for (long start = 0; start < fileSize; start += CHUNK_BYTES) {
                // The last slice may be shorter than a full chunk
                long length = Math.min(CHUNK_BYTES, fileSize - start);
                processChunk(mapped.asSlice(start, length));
            }
        }
    }

    /**
     * Sums every complete 4-byte int in {@code chunk} and prints the total; trailing
     * bytes that do not form a whole int are ignored.
     */
    private void processChunk(MemorySegment chunk) {
        final long size = chunk.byteSize();
        long total = 0;
        for (long pos = 0; pos + 4 <= size; pos += 4) {
            total += chunk.get(ValueLayout.JAVA_INT, pos);
        }
        System.out.println("Chunk sum: " + total);
    }
}

Vectorized Operations

SIMD-Friendly Memory Layout:

import java.lang.foreign.*;

/**
 * Lays out a float array off-heap on a 32-byte boundary and sums it in blocks of
 * eight elements.
 */
public class VectorizedOperations {
    /** Alignment requested for the data, in bytes (one 256-bit vector). */
    private static final long VECTOR_SIZE = 32;  // 256-bit vectors

    /** Allocates 1024 aligned floats, stores 0..1023 in them and prints their sum. */
    public void vectorFriendlyLayout() {
        final int elementCount = 1024;

        try (Arena arena = Arena.ofConfined()) {
            long totalBytes = elementCount * ValueLayout.JAVA_FLOAT.byteSize();

            // Second argument is the byte alignment of the returned segment
            MemorySegment data = arena.allocate(totalBytes, VECTOR_SIZE);

            for (int idx = 0; idx < elementCount; idx++) {
                data.setAtIndex(ValueLayout.JAVA_FLOAT, idx, (float) idx);
            }

            processVectorized(data, elementCount);
        }
    }

    /**
     * Adds up the first {@code count} floats of {@code data}, visiting them in blocks
     * of eight (256 bits / 32 bits per float), and prints the result.
     */
    private void processVectorized(MemorySegment data, int count) {
        final int lanes = 8;
        float total = 0;

        for (int base = 0; base < count; base += lanes) {
            // The final block may hold fewer than eight elements
            int width = Math.min(lanes, count - base);
            for (int lane = 0; lane < width; lane++) {
                total += data.getAtIndex(ValueLayout.JAVA_FLOAT, base + lane);
            }
        }

        System.out.println("Sum: " + total);
    }
}

Structure of Arrays (SoA) Pattern:

/**
 * Structure-of-Arrays layout: rather than one array of {@code {x, y, z}} records
 * ({@code struct Point { float x, y, z; } points[1000];}), each coordinate lives in
 * its own contiguous array ({@code float x[1000], y[1000], z[1000];}).
 */
public class StructureOfArrays {

    /** A fixed number of 3D points stored as three separate off-heap float arrays. */
    public static class Points {
        private final MemorySegment xCoords;
        private final MemorySegment yCoords;
        private final MemorySegment zCoords;
        private final int count;

        /**
         * @param arena arena that owns the three coordinate arrays
         * @param count number of points to reserve room for
         */
        public Points(Arena arena, int count) {
            this.count = count;
            final long bytesPerArray = count * ValueLayout.JAVA_FLOAT.byteSize();

            // One independent allocation per coordinate
            this.xCoords = arena.allocate(bytesPerArray);
            this.yCoords = arena.allocate(bytesPerArray);
            this.zCoords = arena.allocate(bytesPerArray);
        }

        /** Stores all three coordinates of the point at {@code index}. */
        public void set(int index, float x, float y, float z) {
            xCoords.setAtIndex(ValueLayout.JAVA_FLOAT, index, x);
            yCoords.setAtIndex(ValueLayout.JAVA_FLOAT, index, y);
            zCoords.setAtIndex(ValueLayout.JAVA_FLOAT, index, z);
        }

        /** Returns the x coordinate of the point at {@code index}. */
        public float getX(int index) {
            return xCoords.getAtIndex(ValueLayout.JAVA_FLOAT, index);
        }

        /** Returns the y coordinate of the point at {@code index}. */
        public float getY(int index) {
            return yCoords.getAtIndex(ValueLayout.JAVA_FLOAT, index);
        }

        /** Returns the z coordinate of the point at {@code index}. */
        public float getZ(int index) {
            return zCoords.getAtIndex(ValueLayout.JAVA_FLOAT, index);
        }

        /** Returns the sum of all x coordinates, read front to back from one array. */
        public float sumX() {
            float total = 0;
            int idx = 0;
            while (idx < count) {
                total += xCoords.getAtIndex(ValueLayout.JAVA_FLOAT, idx);
                idx++;
            }
            return total;
        }
    }

    /** Fills 1000 points with multiples of their index and prints the sum of all x. */
    public void demonstrateSoA() {
        final int pointCount = 1000;
        try (Arena arena = Arena.ofConfined()) {
            Points points = new Points(arena, pointCount);

            for (int n = 0; n < pointCount; n++) {
                points.set(n, n * 1.0f, n * 2.0f, n * 3.0f);
            }

            float sumX = points.sumX();
            System.out.println("Sum of X: " + sumX);
        }
    }
}

Ring Buffer Implementation

import java.lang.foreign.*;

/**
 * Fixed-capacity byte ring buffer backed by off-heap memory.
 *
 * <p>All state-changing methods are {@code synchronized}, so one instance may be
 * shared between producer and consumer threads. Call {@link #close()} (or use
 * try-with-resources) to free the native memory.
 */
public class RingBuffer implements AutoCloseable {
    private final Arena arena;
    private final MemorySegment buffer;
    private final long capacity;
    private long writePos = 0;
    private long readPos = 0;
    private long size = 0;

    /**
     * @param capacity number of bytes the buffer can hold; must be positive
     * @throws IllegalArgumentException if {@code capacity} is not positive
     */
    public RingBuffer(int capacity) {
        // A zero capacity would make every position computation divide by zero
        if (capacity <= 0) {
            throw new IllegalArgumentException("capacity must be positive: " + capacity);
        }
        this.arena = Arena.ofShared();
        this.capacity = capacity;
        this.buffer = arena.allocate(capacity);
    }

    /**
     * Appends all of {@code data}, or nothing if it does not fit.
     *
     * @param data bytes to append
     * @return {@code true} if the bytes were stored; {@code false} if there is not
     *         enough free space (always the case when {@code data} is longer than
     *         the buffer's capacity)
     */
    public synchronized boolean write(byte[] data) {
        if (data.length > capacity - size) {
            return false;  // Buffer full
        }

        // Bulk-copy in at most two runs: up to the end of the storage, then from
        // its start once the write position wraps around.
        int firstRun = (int) Math.min(data.length, capacity - writePos);
        MemorySegment.copy(data, 0, buffer, ValueLayout.JAVA_BYTE, writePos, firstRun);
        int secondRun = data.length - firstRun;
        if (secondRun > 0) {
            MemorySegment.copy(data, firstRun, buffer, ValueLayout.JAVA_BYTE, 0, secondRun);
        }

        writePos = (writePos + data.length) % capacity;
        size += data.length;
        return true;
    }

    /**
     * Removes and returns up to {@code length} bytes in FIFO order.
     *
     * @param length maximum number of bytes to read
     * @return the bytes read; shorter than {@code length} (possibly empty) when fewer
     *         bytes are buffered
     */
    public synchronized byte[] read(int length) {
        int count = (int) Math.min(length, size);
        byte[] result = new byte[count];

        int firstRun = (int) Math.min(count, capacity - readPos);
        MemorySegment.copy(buffer, ValueLayout.JAVA_BYTE, readPos, result, 0, firstRun);
        int secondRun = count - firstRun;
        if (secondRun > 0) {
            MemorySegment.copy(buffer, ValueLayout.JAVA_BYTE, 0, result, firstRun, secondRun);
        }

        readPos = (readPos + count) % capacity;
        size -= count;
        return result;
    }

    /** Returns the number of bytes currently stored and available to read. */
    public synchronized long available() {
        return size;
    }

    /** Returns the number of bytes that can still be written before the buffer is full. */
    public synchronized long remaining() {
        return capacity - size;
    }

    /** Releases the native memory; the buffer must not be used afterwards. */
    @Override
    public void close() {
        arena.close();
    }

    /**
     * Sleeps for a millisecond while waiting for the other side to make progress.
     *
     * @return {@code false} if the thread was interrupted (the interrupt flag is
     *         restored so callers can stop), {@code true} otherwise
     */
    private static boolean pauseBriefly() {
        try {
            Thread.sleep(1);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    // Usage example
    public static void main(String[] args) {
        try (RingBuffer ring = new RingBuffer(1024)) {

            // Producer
            Thread producer = new Thread(() -> {
                for (int i = 0; i < 100; i++) {
                    byte[] data = ("Message " + i + "\n")
                            .getBytes(java.nio.charset.StandardCharsets.UTF_8);
                    while (!ring.write(data)) {
                        if (!pauseBriefly()) {
                            return;
                        }
                    }
                }
            });

            // Consumer: drains until the producer has finished AND the buffer is empty.
            // A fixed number of reads cannot work because a read returns a variable
            // number of bytes, so the loop would either stop early or wait forever.
            Thread consumer = new Thread(() -> {
                while (true) {
                    // Sample liveness BEFORE reading: if the producer was already done
                    // and the read still comes back empty, nothing more can arrive.
                    boolean producerDone = !producer.isAlive();
                    byte[] data = ring.read(20);
                    if (data.length > 0) {
                        System.out.print(
                                new String(data, java.nio.charset.StandardCharsets.UTF_8));
                    } else if (producerDone || !pauseBriefly()) {
                        return;
                    }
                }
            });

            producer.start();
            consumer.start();

            try {
                producer.join();
                consumer.join();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }
    }
}

Memory Pool for High-Frequency Allocations

import java.lang.foreign.*;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;

/**
 * Pool of equally sized off-heap blocks for code paths that allocate at high frequency.
 *
 * <p>Blocks are handed out by {@link #acquire()} and must be returned with
 * {@link #release(MemorySegment)}. The pool tracks which blocks are currently handed
 * out, so releasing a block twice or releasing a segment that did not come from this
 * pool is rejected instead of silently giving one block to two owners.
 */
public class MemoryPool implements AutoCloseable {
    private final Arena arena;
    private final Queue<MemorySegment> freeList = new ConcurrentLinkedQueue<>();
    // Blocks currently handed out, keyed and valued by the pool's own segment instance
    private final java.util.Map<MemorySegment, MemorySegment> leased =
            new java.util.concurrent.ConcurrentHashMap<>();
    private final long blockSize;
    private final int maxBlocks;
    private int allocatedBlocks = 0;  // mutated only in the constructor or under the monitor

    /**
     * @param blockSize     size of every block in bytes; must be positive
     * @param initialBlocks number of blocks to allocate up front; 0..{@code maxBlocks}
     * @param maxBlocks     upper bound on the number of blocks the pool will ever create
     * @throws IllegalArgumentException if any argument is out of range
     */
    public MemoryPool(long blockSize, int initialBlocks, int maxBlocks) {
        if (blockSize <= 0) {
            throw new IllegalArgumentException("blockSize must be positive: " + blockSize);
        }
        if (initialBlocks < 0 || initialBlocks > maxBlocks) {
            throw new IllegalArgumentException(
                    "initialBlocks must be between 0 and maxBlocks: " + initialBlocks);
        }

        this.arena = Arena.ofShared();
        this.blockSize = blockSize;
        this.maxBlocks = maxBlocks;

        // Pre-allocate initial blocks; free the arena again if that fails part-way
        try {
            for (int i = 0; i < initialBlocks; i++) {
                freeList.offer(arena.allocate(blockSize));
                allocatedBlocks++;
            }
        } catch (RuntimeException | Error e) {
            arena.close();
            throw e;
        }
    }

    /**
     * Hands out a zero-filled block, reusing a released one when available.
     *
     * @return a block of exactly {@code blockSize} bytes
     * @throws OutOfMemoryError if no block is free and {@code maxBlocks} blocks already
     *         exist (kept as {@code OutOfMemoryError} for compatibility with callers)
     */
    public MemorySegment acquire() {
        MemorySegment segment = freeList.poll();

        if (segment == null) {
            synchronized (this) {
                if (allocatedBlocks >= maxBlocks) {
                    throw new OutOfMemoryError("Pool exhausted");
                }
                // Freshly allocated arena memory is already zero-initialized
                segment = arena.allocate(blockSize);
                allocatedBlocks++;
            }
        } else {
            // Clear segment before reuse
            segment.fill((byte) 0);
        }

        leased.put(segment, segment);
        return segment;
    }

    /**
     * Returns a block obtained from {@link #acquire()} to the pool.
     *
     * @param segment the block to return
     * @throws IllegalArgumentException if the segment has the wrong size, was not
     *         handed out by this pool, or has already been released
     */
    public void release(MemorySegment segment) {
        if (segment.byteSize() != blockSize) {
            throw new IllegalArgumentException("Invalid segment size");
        }

        // remove() succeeds for exactly one caller per lease, which rules out double
        // release; re-queue the pool's own instance rather than the caller's view.
        MemorySegment owned = leased.remove(segment);
        if (owned == null) {
            throw new IllegalArgumentException("Segment is not currently leased from this pool");
        }

        freeList.offer(owned);
    }

    /** Returns the number of blocks that are currently free for reuse. */
    public int available() {
        return freeList.size();
    }

    /** Frees all native memory; blocks still held by callers become unusable. */
    @Override
    public void close() {
        freeList.clear();
        leased.clear();
        arena.close();
    }
}

// Usage
/**
 * Example client of {@code MemoryPool}: borrows one 4096-byte block per call, copies
 * the input into it and returns the block when done.
 */
class PooledProcessor {
    private final MemoryPool pool = new MemoryPool(4096, 10, 100);

    /**
     * Copies {@code data} into a pooled off-heap block for processing.
     *
     * @param data bytes to process; copied to the start of the borrowed block
     */
    public void processData(byte[] data) {
        final MemorySegment block = pool.acquire();
        try {
            // Stage the input in pooled memory
            final int length = data.length;
            MemorySegment.copy(data, 0, block, ValueLayout.JAVA_BYTE, 0, length);

            // Process...

        } finally {
            // Hand the block back even when copying or processing throws
            pool.release(block);
        }
    }
}

Lock-Free Queue

import java.lang.foreign.*;
import java.util.concurrent.atomic.AtomicLong;

/**
 * Bounded lock-free queue of {@code long} values that is safe for multiple producers
 * and multiple consumers. Payloads are stored off-heap.
 *
 * <p>Each slot carries a sequence number that says which ticket may use it next:
 * {@code 2 * t} means "free, waiting for the producer holding ticket {@code t}" and
 * {@code 2 * t + 1} means "filled by ticket {@code t}, waiting for its consumer".
 * A thread first checks the slot's sequence, then claims the ticket with a CAS on
 * {@code tail}/{@code head}, and only publishes the slot to the other side after it
 * has finished writing or reading the payload. The sequence numbers are kept in an
 * {@code AtomicLongArray} so that these hand-overs have volatile semantics; a slot is
 * therefore never overwritten before its consumer has read it and never read before
 * its producer has finished writing it.
 */
public class LockFreeQueue implements AutoCloseable {
    private final Arena arena;
    private final MemorySegment buffer;
    private final long capacity;
    private final AtomicLong head = new AtomicLong(0);
    private final AtomicLong tail = new AtomicLong(0);
    private final long entrySize = 8;  // one 8-byte payload per slot
    private final java.util.concurrent.atomic.AtomicLongArray sequences;

    /**
     * @param capacity maximum number of queued values; must be positive
     * @throws IllegalArgumentException if {@code capacity} is not positive
     */
    public LockFreeQueue(int capacity) {
        if (capacity <= 0) {
            throw new IllegalArgumentException("capacity must be positive: " + capacity);
        }
        this.capacity = capacity;
        this.sequences = new java.util.concurrent.atomic.AtomicLongArray(capacity);
        for (int slot = 0; slot < capacity; slot++) {
            sequences.set(slot, 2L * slot);  // slot i is initially free for ticket i
        }
        this.arena = Arena.ofShared();
        // Request 8-byte alignment explicitly because slots are accessed as longs
        this.buffer = arena.allocate(capacity * entrySize, 8);
    }

    /**
     * Appends {@code value} if there is room.
     *
     * @param value value to enqueue
     * @return {@code true} if the value was enqueued, {@code false} if the queue is full
     */
    public boolean offer(long value) {
        while (true) {
            long ticket = tail.get();
            int slot = (int) (ticket % capacity);
            long diff = sequences.get(slot) - 2 * ticket;

            if (diff < 0) {
                // The slot still belongs to the previous lap: the queue is full
                return false;
            }
            if (diff == 0 && tail.compareAndSet(ticket, ticket + 1)) {
                // The slot is exclusively ours until the sequence is published
                buffer.set(ValueLayout.JAVA_LONG, slot * entrySize, value);
                sequences.set(slot, 2 * ticket + 1);  // publish to consumers
                return true;
            }
            // Otherwise another producer took this ticket; retry with a fresh tail
        }
    }

    /**
     * Removes and returns the oldest value.
     *
     * @return the dequeued value, or {@code null} if no completed entry is available
     */
    public Long poll() {
        while (true) {
            long ticket = head.get();
            int slot = (int) (ticket % capacity);
            long diff = sequences.get(slot) - (2 * ticket + 1);

            if (diff < 0) {
                // Nothing has been published for this ticket yet: the queue is empty
                return null;
            }
            if (diff == 0 && head.compareAndSet(ticket, ticket + 1)) {
                long value = buffer.get(ValueLayout.JAVA_LONG, slot * entrySize);
                // Free the slot for the producer of the next lap only after reading
                sequences.set(slot, 2 * (ticket + capacity));
                return value;
            }
            // Otherwise another consumer took this ticket; retry with a fresh head
        }
    }

    /** Releases the native memory; the queue must not be used afterwards. */
    @Override
    public void close() {
        arena.close();
    }
}

Cache-Friendly Data Structures

Aligned Allocation for Cache Lines:

/**
 * Counters placed on 64-byte boundaries so that each one occupies its own
 * 64-byte region of memory.
 *
 * <p>Increments are a plain read followed by a plain write, i.e. they are not atomic.
 */
public class CacheAlignedStructures {
    private static final long CACHE_LINE_SIZE = 64;

    /** A single {@code long} counter stored at the start of its own 64-byte block. */
    public static class PaddedCounter {
        private final MemorySegment segment;

        /** Allocates a 64-byte, 64-byte-aligned block from {@code arena} and zeroes the counter. */
        public PaddedCounter(Arena arena) {
            MemorySegment block = arena.allocate(CACHE_LINE_SIZE, CACHE_LINE_SIZE);
            block.set(ValueLayout.JAVA_LONG, 0, 0L);
            this.segment = block;
        }

        /** Adds one to the counter. */
        public void increment() {
            segment.set(ValueLayout.JAVA_LONG, 0, get() + 1);
        }

        /** Returns the current counter value. */
        public long get() {
            return segment.get(ValueLayout.JAVA_LONG, 0);
        }
    }

    /** An array of {@code long} counters spaced 64 bytes apart. */
    public static class CounterArray {
        private final MemorySegment counters;
        private final int count;

        /**
         * @param arena arena that owns the counter storage
         * @param count number of counters
         */
        public CounterArray(Arena arena, int count) {
            this.count = count;
            // 64 bytes per counter, with the first one on a 64-byte boundary
            this.counters = arena.allocate(count * CACHE_LINE_SIZE, CACHE_LINE_SIZE);

            for (int slot = 0; slot < count; slot++) {
                counters.set(ValueLayout.JAVA_LONG, offsetOf(slot), 0L);
            }
        }

        /** Byte offset of counter {@code index} within the backing segment. */
        private static long offsetOf(int index) {
            return index * CACHE_LINE_SIZE;
        }

        /** Adds one to the counter at {@code index}. */
        public void increment(int index) {
            final long where = offsetOf(index);
            final long next = counters.get(ValueLayout.JAVA_LONG, where) + 1;
            counters.set(ValueLayout.JAVA_LONG, where, next);
        }

        /** Returns the value of the counter at {@code index}. */
        public long get(int index) {
            return counters.get(ValueLayout.JAVA_LONG, offsetOf(index));
        }

        /** Returns the sum of all counters. */
        public long sum() {
            long total = 0;
            int slot = 0;
            while (slot < count) {
                total += get(slot);
                slot++;
            }
            return total;
        }
    }
}

Performance Benchmarking

import java.lang.foreign.*;

/**
 * Rough wall-clock comparisons of heap and off-heap memory operations.
 *
 * <p>These are single-shot {@link System#nanoTime()} measurements without warm-up, so
 * the printed numbers are only indicative.
 */
public class MemoryBenchmark {
    /** Off-heap blocks are freed after this many allocations to bound native memory use. */
    private static final int OFF_HEAP_BATCH = 10_000;

    /**
     * Times one million 1 KB allocations on the heap and off-heap and prints both
     * durations in milliseconds.
     */
    public void benchmarkHeapVsOffHeap() {
        int iterations = 1_000_000;
        int size = 1024;

        // Heap allocation
        long heapStart = System.nanoTime();
        for (int i = 0; i < iterations; i++) {
            byte[] array = new byte[size];
            array[0] = (byte) i;
        }
        long heapTime = System.nanoTime() - heapStart;

        // Off-heap allocation. A single arena would keep all 1,000,000 blocks
        // (about 1 GB) alive until the end, so allocate in batches and let each
        // batch's arena free its memory before the next one starts.
        long offHeapStart = System.nanoTime();
        for (int done = 0; done < iterations; done += OFF_HEAP_BATCH) {
            int end = Math.min(done + OFF_HEAP_BATCH, iterations);
            try (Arena arena = Arena.ofConfined()) {
                for (int i = done; i < end; i++) {
                    MemorySegment segment = arena.allocate(size);
                    segment.set(ValueLayout.JAVA_BYTE, 0, (byte) i);
                }
            }
        }
        long offHeapTime = System.nanoTime() - offHeapStart;

        System.out.printf("Heap: %.2f ms\n", heapTime / 1_000_000.0);
        System.out.printf("Off-heap: %.2f ms\n", offHeapTime / 1_000_000.0);
    }

    /**
     * Times copying 1 MB between two off-heap segments, once with a bulk copy and
     * once byte by byte, and prints both durations and their ratio.
     */
    public void benchmarkCopyOperations() {
        int size = 1024 * 1024;  // 1 MB
        byte[] srcArray = new byte[size];

        try (Arena arena = Arena.ofConfined()) {
            // allocate + copy works on every release of the API; the one-step
            // array allocator was renamed (allocateArray -> allocateFrom) in Java 22.
            MemorySegment srcSegment = arena.allocate(size);
            MemorySegment.copy(srcArray, 0, srcSegment, ValueLayout.JAVA_BYTE, 0, size);
            MemorySegment dstSegment = arena.allocate(size);

            // Benchmark bulk copy
            long start = System.nanoTime();
            MemorySegment.copy(srcSegment, 0, dstSegment, 0, size);
            long bulkTime = System.nanoTime() - start;

            // Benchmark element-by-element
            start = System.nanoTime();
            for (int i = 0; i < size; i++) {
                byte b = srcSegment.get(ValueLayout.JAVA_BYTE, i);
                dstSegment.set(ValueLayout.JAVA_BYTE, i, b);
            }
            long elementTime = System.nanoTime() - start;

            System.out.printf("Bulk copy: %.2f ms\n", bulkTime / 1_000_000.0);
            System.out.printf("Element copy: %.2f ms\n", elementTime / 1_000_000.0);
            // Guard against a zero reading from a coarse timer
            System.out.printf("Speedup: %.1fx\n", (double) elementTime / Math.max(bulkTime, 1L));
        }
    }
}

Best Practices

1. Align Critical Data:

// Good - cache-aligned for performance
// (the second argument is the requested alignment in bytes; 64 is the cache-line
// size this document assumes)
MemorySegment aligned = arena.allocate(size, 64);

// Consider alignment for:
// - Atomic operations
// - SIMD processing
// - Avoiding false sharing

2. Minimize Copies:

// Good - zero-copy with memory-mapped file: read the data through the mapped segment
MemorySegment mapped = channel.map(...);

// Bad - unnecessary copy: toArray copies the segment's contents into a new heap array
byte[] data = mapped.toArray(ValueLayout.JAVA_BYTE);

3. Use Pooling for Hot Paths:

// Good - reuse allocations: borrow a block from the pool and always hand it back
MemorySegment buffer = pool.acquire();
try {
    // Use buffer
} finally {
    // Runs on normal and exceptional exit, so the block is returned either way
    pool.release(buffer);
}

// Bad - constant allocation/deallocation: every iteration creates an arena,
// allocates 1024 bytes from it and closes the arena again
for (int i = 0; i < 1000000; i++) {
    try (Arena arena = Arena.ofConfined()) {
        MemorySegment seg = arena.allocate(1024);
    }
}

4. Consider SoA for Vector Operations:

// Good - better cache locality for vector ops: one contiguous array per coordinate,
// so a pass over a single coordinate touches only that array
float[] xCoords = ..., yCoords = ..., zCoords = ...;

// Bad - scattered access pattern: the coordinates of each point are interleaved,
// so a pass over a single coordinate has to step over the other two
Point[] points = ...;  // struct { float x, y, z; }

5. Profile Before Optimizing:

// Measure actual performance impact
// System.nanoTime() is only meaningful as a difference between two readings
long start = System.nanoTime();
// ... operation ...
long duration = System.nanoTime() - start;  // elapsed time in nanoseconds

These advanced patterns enable high-performance memory operations for demanding applications.