12.3 Advanced Memory Patterns and Performance
Optimize memory-intensive operations with advanced patterns for zero-copy, vectorization, and efficient data structures.
Zero-Copy Patterns
Direct Buffer Integration:
import java.lang.foreign.*;
import java.nio.ByteBuffer;
/**
 * Demonstrates sharing memory between NIO buffers and {@link MemorySegment}s without copying.
 */
public class ZeroCopyPatterns {

    /** Smallest payload {@code processData} can decode: a single 32-bit message type. */
    private static final int HEADER_BYTES = Integer.BYTES;

    /**
     * Fills a direct {@link ByteBuffer}, wraps it as a {@link MemorySegment} (no copy) and reads
     * the same values back through the segment.
     */
    public void directBufferToSegment() {
        // A ByteBuffer is big-endian by default, whereas ValueLayout.JAVA_INT / JAVA_LONG use the
        // platform's native order. Switch the buffer to native order so both views of the memory
        // agree; otherwise the values read back byte-swapped on little-endian hardware.
        ByteBuffer buffer = ByteBuffer.allocateDirect(1024).order(java.nio.ByteOrder.nativeOrder());

        buffer.putInt(42);
        buffer.putLong(123456789L);
        buffer.flip();

        // Zero-copy wrap: the segment spans the buffer's [position, limit) range, i.e. exactly
        // the 12 bytes written above.
        MemorySegment segment = MemorySegment.ofBuffer(buffer);

        int value = segment.get(ValueLayout.JAVA_INT, 0);
        // The long starts at byte offset 4, which is not 8-byte aligned, so the aligned
        // JAVA_LONG layout would reject the access. Use the unaligned variant.
        long longValue = segment.get(ValueLayout.JAVA_LONG_UNALIGNED, 4);

        System.out.println("Int: " + value + ", Long: " + longValue);
    }

    /**
     * Performs one read from {@code socket} directly into native memory and hands the received
     * bytes to {@code processData} without an intermediate heap copy.
     *
     * <p>Nothing is processed when the read hits end-of-stream, returns no data, or delivers
     * fewer than {@value #HEADER_BYTES} bytes. A real protocol handler would keep reading until
     * a complete header has arrived.
     *
     * @param socket channel to read from
     * @throws Exception if the read fails
     */
    public void networkReceiveZeroCopy(java.nio.channels.SocketChannel socket)
            throws Exception {
        try (Arena arena = Arena.ofConfined()) {
            MemorySegment buffer = arena.allocate(8192);

            // The ByteBuffer is a view over the segment's memory, not a copy.
            ByteBuffer byteBuffer = buffer.asByteBuffer();

            int bytesRead = socket.read(byteBuffer);

            // Guard against short reads: decoding the header from a 1-3 byte slice would fail
            // with an IndexOutOfBoundsException.
            if (bytesRead >= HEADER_BYTES) {
                processData(buffer.asSlice(0, bytesRead));
            }
        }
    }

    /**
     * Decodes and prints the message type stored in the first four bytes of {@code data}.
     *
     * @param data received bytes; must hold at least {@value #HEADER_BYTES} bytes
     */
    private void processData(MemorySegment data) {
        // NOTE(review): JAVA_INT decodes in native byte order; confirm this matches the wire format.
        int messageType = data.get(ValueLayout.JAVA_INT, 0);
        System.out.println("Message type: " + messageType);
    }
}
Memory-Mapped File Zero-Copy:
import java.nio.channels.FileChannel;
import java.nio.file.*;
/**
 * Walks a file through a read-only memory mapping, one slice at a time, so the file contents
 * are never copied into a heap array.
 */
public class MappedFileZeroCopy {

    /**
     * Maps the whole file at {@code path} and passes it to {@code processChunk} in slices of at
     * most 1 MB each.
     *
     * @param path file to map and process
     * @throws Exception if the file cannot be opened, sized or mapped
     */
    public void processLargeFile(Path path) throws Exception {
        try (FileChannel channel = FileChannel.open(path, StandardOpenOption.READ);
             Arena arena = Arena.ofShared()) {
            final long totalBytes = channel.size();
            final MemorySegment mapping =
                    channel.map(FileChannel.MapMode.READ_ONLY, 0, totalBytes, arena);

            final long sliceLimit = 1024 * 1024; // 1 MB per slice
            for (long position = 0; position < totalBytes; ) {
                // The final slice may be shorter than the limit.
                long length = Math.min(sliceLimit, totalBytes - position);
                processChunk(mapping.asSlice(position, length));
                position += length;
            }
        }
    }

    /**
     * Sums the chunk's contents interpreted as consecutive 32-bit ints and prints the total.
     * Trailing bytes that do not form a complete int are ignored.
     *
     * @param chunk slice of the mapped file
     */
    private void processChunk(MemorySegment chunk) {
        final long byteCount = chunk.byteSize();
        long total = 0;
        // Stop as soon as fewer than four bytes remain.
        for (long pos = 0; pos + Integer.BYTES <= byteCount; pos += Integer.BYTES) {
            total += chunk.get(ValueLayout.JAVA_INT, pos);
        }
        System.out.println("Chunk sum: " + total);
    }
}
Vectorized Operations
SIMD-Friendly Memory Layout:
import java.lang.foreign.*;
/**
 * Lays out float data on a vector-width boundary and sums it in vector-sized groups.
 */
public class VectorizedOperations {

    /** Alignment, in bytes, requested for the data segment (32 bytes = 256 bits). */
    private static final long VECTOR_SIZE = 32; // 256-bit vectors

    /**
     * Allocates 1024 floats aligned to {@code VECTOR_SIZE}, stores each element's own index in
     * it, and prints the sum of all elements.
     */
    public void vectorFriendlyLayout() {
        try (Arena arena = Arena.ofConfined()) {
            final int elementCount = 1024;
            final long totalBytes = elementCount * ValueLayout.JAVA_FLOAT.byteSize();

            MemorySegment data = arena.allocate(totalBytes, VECTOR_SIZE);

            // Element i holds the value i.
            for (int i = 0; i < elementCount; i++) {
                data.setAtIndex(ValueLayout.JAVA_FLOAT, i, (float) i);
            }

            processVectorized(data, elementCount);
        }
    }

    /**
     * Sums the first {@code count} floats of {@code data} in groups of eight (256 bits / 32 bits
     * per float) and prints the result. Each group is still read one scalar at a time, in index
     * order.
     *
     * @param data  segment holding the floats
     * @param count number of floats to sum
     */
    private void processVectorized(MemorySegment data, int count) {
        final int lanes = 8;
        float total = 0;
        for (int base = 0; base < count; base += lanes) {
            // The last group is cut short when count is not a multiple of the lane count.
            final int groupEnd = Math.min(base + lanes, count);
            for (int k = base; k < groupEnd; k++) {
                total += data.getAtIndex(ValueLayout.JAVA_FLOAT, k);
            }
        }
        System.out.println("Sum: " + total);
    }
}
Structure of Arrays (SoA) Pattern:
/**
 * Structure-of-Arrays example: rather than one array of {@code {x, y, z}} records, each
 * coordinate lives in its own contiguous array ({@code float x[n], y[n], z[n]}).
 */
public class StructureOfArrays {

    /** A fixed-size set of 3D points stored as three separate float arrays. */
    public static class Points {
        private final MemorySegment xCoords;
        private final MemorySegment yCoords;
        private final MemorySegment zCoords;
        private final int count;

        /**
         * Allocates one float array per coordinate from {@code arena}.
         *
         * @param arena arena that owns the three arrays
         * @param count number of points
         */
        public Points(Arena arena, int count) {
            this.count = count;
            final long bytesPerAxis = count * ValueLayout.JAVA_FLOAT.byteSize();
            this.xCoords = arena.allocate(bytesPerAxis);
            this.yCoords = arena.allocate(bytesPerAxis);
            this.zCoords = arena.allocate(bytesPerAxis);
        }

        /** Byte offset of element {@code index} inside any of the coordinate arrays. */
        private static long byteOffset(int index) {
            return index * 4L;
        }

        /**
         * Stores all three coordinates of the point at {@code index}.
         *
         * @param index position of the point
         * @param x     new x coordinate
         * @param y     new y coordinate
         * @param z     new z coordinate
         */
        public void set(int index, float x, float y, float z) {
            final long at = byteOffset(index);
            xCoords.set(ValueLayout.JAVA_FLOAT, at, x);
            yCoords.set(ValueLayout.JAVA_FLOAT, at, y);
            zCoords.set(ValueLayout.JAVA_FLOAT, at, z);
        }

        /** Returns the x coordinate of the point at {@code index}. */
        public float getX(int index) {
            return xCoords.get(ValueLayout.JAVA_FLOAT, byteOffset(index));
        }

        /** Returns the y coordinate of the point at {@code index}. */
        public float getY(int index) {
            return yCoords.get(ValueLayout.JAVA_FLOAT, byteOffset(index));
        }

        /** Returns the z coordinate of the point at {@code index}. */
        public float getZ(int index) {
            return zCoords.get(ValueLayout.JAVA_FLOAT, byteOffset(index));
        }

        /**
         * Adds up every x coordinate in index order; only the x array is read.
         *
         * @return the sum of all x coordinates
         */
        public float sumX() {
            float total = 0;
            int i = 0;
            while (i < count) {
                total += xCoords.get(ValueLayout.JAVA_FLOAT, byteOffset(i));
                i++;
            }
            return total;
        }
    }

    /** Fills 1000 points with {@code (i, 2i, 3i)} and prints the sum of their x coordinates. */
    public void demonstrateSoA() {
        try (Arena arena = Arena.ofConfined()) {
            final int pointCount = 1000;
            Points points = new Points(arena, pointCount);

            for (int i = 0; i < pointCount; i++) {
                points.set(i, i * 1.0f, i * 2.0f, i * 3.0f);
            }

            float sumX = points.sumX();
            System.out.println("Sum of X: " + sumX);
        }
    }
}
Ring Buffer Implementation
import java.lang.foreign.*;
/**
 * Fixed-capacity byte ring buffer backed by off-heap memory.
 *
 * <p>All operations are {@code synchronized} on the buffer, so one instance can be shared
 * between producer and consumer threads. Call {@link #close()} (or use try-with-resources) to
 * free the native memory.
 */
public class RingBuffer implements AutoCloseable {
    private final Arena arena;
    private final MemorySegment buffer;
    private final long capacity;
    private long writePos = 0;
    private long readPos = 0;
    private long size = 0;

    /**
     * Creates a ring buffer holding up to {@code capacity} bytes.
     *
     * @param capacity buffer size in bytes
     * @throws IllegalArgumentException if {@code capacity} is negative
     */
    public RingBuffer(int capacity) {
        if (capacity < 0) {
            // Checked up front so that no arena is created (and leaked) for an invalid size.
            throw new IllegalArgumentException("capacity must not be negative: " + capacity);
        }
        this.arena = Arena.ofShared();
        this.capacity = capacity;
        this.buffer = arena.allocate(capacity);
    }

    /**
     * Appends all of {@code data}, or nothing at all.
     *
     * @param data bytes to append
     * @return {@code true} if the bytes were stored, {@code false} if there is not enough free
     *         space for the whole array
     */
    public synchronized boolean write(byte[] data) {
        if (data.length > capacity - size) {
            return false; // Buffer full
        }
        if (data.length == 0) {
            return true;
        }

        // Bulk-copy in at most two pieces: up to the physical end, then the wrapped remainder.
        int untilEnd = (int) Math.min(data.length, capacity - writePos);
        MemorySegment.copy(data, 0, buffer, ValueLayout.JAVA_BYTE, writePos, untilEnd);
        int wrapped = data.length - untilEnd;
        if (wrapped > 0) {
            MemorySegment.copy(data, untilEnd, buffer, ValueLayout.JAVA_BYTE, 0, wrapped);
        }

        writePos = (writePos + data.length) % capacity;
        size += data.length;
        return true;
    }

    /**
     * Removes and returns up to {@code length} bytes in FIFO order.
     *
     * @param length maximum number of bytes to read
     * @return the bytes read; shorter than {@code length} (possibly empty) when fewer bytes are
     *         buffered
     * @throws NegativeArraySizeException if {@code length} is negative
     */
    public synchronized byte[] read(int length) {
        int count = (int) Math.min(length, size);
        byte[] result = new byte[count];
        if (count == 0) {
            return result;
        }

        // Mirror of write(): copy up to the physical end, then the wrapped remainder.
        int untilEnd = (int) Math.min(count, capacity - readPos);
        MemorySegment.copy(buffer, ValueLayout.JAVA_BYTE, readPos, result, 0, untilEnd);
        int wrapped = count - untilEnd;
        if (wrapped > 0) {
            MemorySegment.copy(buffer, ValueLayout.JAVA_BYTE, 0, result, untilEnd, wrapped);
        }

        readPos = (readPos + count) % capacity;
        size -= count;
        return result;
    }

    /** Returns the number of bytes currently buffered and ready to be read. */
    public synchronized long available() {
        return size;
    }

    /** Returns the number of bytes that can still be written before the buffer is full. */
    public synchronized long remaining() {
        return capacity - size;
    }

    /** Releases the native memory; the buffer must not be used afterwards. */
    @Override
    public void close() {
        arena.close();
    }

    /**
     * Usage example: a producer writes 100 text messages while a consumer prints them.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        final java.nio.charset.Charset charset = java.nio.charset.StandardCharsets.UTF_8;
        final java.util.concurrent.atomic.AtomicBoolean producerDone =
                new java.util.concurrent.atomic.AtomicBoolean(false);

        try (RingBuffer ring = new RingBuffer(1024)) {
            // Producer
            Thread producer = new Thread(() -> {
                try {
                    for (int i = 0; i < 100; i++) {
                        byte[] data = ("Message " + i + "\n").getBytes(charset);
                        while (!ring.write(data)) {
                            Thread.sleep(1); // buffer full: wait for the consumer to drain it
                        }
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                } finally {
                    producerDone.set(true);
                }
            });

            // Consumer: drains until the producer has finished AND the buffer is empty. A fixed
            // number of reads would either stop early or wait forever for data that never comes.
            Thread consumer = new Thread(() -> {
                try {
                    while (true) {
                        // Sample the flag before reading: if the producer had already finished
                        // and the read still comes back empty, nothing more can arrive.
                        boolean finished = producerDone.get();
                        byte[] data = ring.read(20);
                        if (data.length > 0) {
                            System.out.print(new String(data, charset));
                        } else if (finished) {
                            break;
                        } else {
                            Thread.sleep(1);
                        }
                    }
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            });

            producer.start();
            consumer.start();

            try {
                producer.join();
                consumer.join();
            } catch (InterruptedException e) {
                // Ask the workers to stop and preserve the interrupt for our caller.
                producer.interrupt();
                consumer.interrupt();
                Thread.currentThread().interrupt();
            }
        }
    }
}
Memory Pool for High-Frequency Allocations
import java.lang.foreign.*;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
/**
 * Pool of equally sized off-heap blocks for hot paths that would otherwise allocate constantly.
 *
 * <p>Blocks are handed out by {@link #acquire()} and returned with
 * {@link #release(MemorySegment)}. All blocks live in one shared arena and are freed together
 * by {@link #close()}. The pool does not track which blocks are outstanding: callers must
 * release each block exactly once and must not touch a block after releasing it.
 */
public class MemoryPool implements AutoCloseable {
    private final Arena arena;
    private final Queue<MemorySegment> freeList = new ConcurrentLinkedQueue<>();
    private final long blockSize;
    private final int maxBlocks;
    // Number of blocks ever allocated; after construction only changed under the pool's monitor.
    private int allocatedBlocks = 0;

    /**
     * Creates a pool and pre-allocates {@code initialBlocks} blocks.
     *
     * @param blockSize     size of every block in bytes; must be positive
     * @param initialBlocks number of blocks to allocate up front; {@code 0..maxBlocks}
     * @param maxBlocks     upper bound on the number of blocks the pool will ever allocate
     * @throws IllegalArgumentException if any argument is out of range
     */
    public MemoryPool(long blockSize, int initialBlocks, int maxBlocks) {
        // Validate before creating the arena so a bad argument cannot leak native resources
        // or let the pool start out above its own limit.
        if (blockSize <= 0) {
            throw new IllegalArgumentException("blockSize must be positive: " + blockSize);
        }
        if (initialBlocks < 0 || initialBlocks > maxBlocks) {
            throw new IllegalArgumentException(
                    "initialBlocks must be between 0 and maxBlocks (" + maxBlocks + "): "
                            + initialBlocks);
        }

        this.arena = Arena.ofShared();
        this.blockSize = blockSize;
        this.maxBlocks = maxBlocks;

        // Pre-allocate initial blocks
        for (int i = 0; i < initialBlocks; i++) {
            freeList.offer(arena.allocate(blockSize));
            allocatedBlocks++;
        }
    }

    /**
     * Returns a zero-filled block, reusing a released one when available and allocating a new
     * one otherwise.
     *
     * @return a block of {@code blockSize} bytes, all zero
     * @throws OutOfMemoryError if no block is free and {@code maxBlocks} blocks already exist
     */
    public MemorySegment acquire() {
        MemorySegment segment = freeList.poll();

        if (segment == null) {
            // Growing the pool is serialized so the maxBlocks limit cannot be overshot.
            synchronized (this) {
                if (allocatedBlocks < maxBlocks) {
                    segment = arena.allocate(blockSize);
                    allocatedBlocks++;
                } else {
                    throw new OutOfMemoryError("Pool exhausted");
                }
            }
        }

        // Clear segment before reuse so callers never see a previous user's data
        segment.fill((byte) 0);
        return segment;
    }

    /**
     * Returns a block to the pool so it can be handed out again.
     *
     * @param segment a block previously obtained from {@link #acquire()}
     * @throws IllegalArgumentException if the segment's size differs from the pool's block size
     */
    public void release(MemorySegment segment) {
        if (segment.byteSize() != blockSize) {
            throw new IllegalArgumentException("Invalid segment size");
        }
        freeList.offer(segment);
    }

    /** Returns the number of blocks currently sitting in the free list. */
    public int available() {
        return freeList.size();
    }

    /** Frees every block of the pool, including those still held by callers. */
    @Override
    public void close() {
        freeList.clear();
        arena.close();
    }
}
// Usage
/**
 * Usage example for {@code MemoryPool}: borrows a block for the duration of one call.
 */
class PooledProcessor {
    // 4096-byte blocks, 10 allocated up front, at most 100 in total.
    private final MemoryPool pool = new MemoryPool(4096, 10, 100);

    /**
     * Copies {@code data} into a pooled off-heap block and always hands the block back to the
     * pool afterwards, even if processing throws.
     *
     * @param data bytes to stage in pooled memory
     */
    public void processData(byte[] data) {
        final MemorySegment pooled = pool.acquire();
        try {
            // Stage the payload at the start of the pooled block.
            final MemorySegment source = MemorySegment.ofArray(data);
            MemorySegment.copy(source, 0L, pooled, 0L, data.length);

            // Process...
        } finally {
            pool.release(pooled);
        }
    }
}
Lock-Free Queue
import java.lang.foreign.*;
import java.util.concurrent.atomic.AtomicLong;
/**
 * Bounded multi-producer / multi-consumer queue of {@code long} values stored off-heap.
 *
 * <p>Each slot has a sequence number that says whose turn it is. For the absolute position
 * {@code p} that maps to a slot, {@code 2*p} means "free for the producer of position p" and
 * {@code 2*p + 1} means "holds the value for the consumer of position p". A consumer re-arms
 * the slot with {@code 2*(p + capacity)} for the next lap. A slot is therefore never overwritten
 * before it has been read, and never read before it has been fully written.
 *
 * <p>The sequence numbers are kept in an {@code AtomicLongArray}. Its volatile reads and writes
 * order the plain accesses to the off-heap values, which have no ordering guarantees themselves.
 *
 * <p>No operation blocks or spins waiting for another thread. {@link #offer(long)} may report
 * "full" and {@link #poll()} may report "empty" while another thread is in the middle of
 * reading or writing the slot concerned.
 */
public class LockFreeQueue implements AutoCloseable {
    private final Arena arena;
    private final MemorySegment buffer;
    private final long capacity;
    private final AtomicLong head = new AtomicLong(0);
    private final AtomicLong tail = new AtomicLong(0);
    private final long entrySize = Long.BYTES; // one 8-byte value per slot
    // Per-slot turn counters; see the class comment for the encoding.
    private final java.util.concurrent.atomic.AtomicLongArray sequences;

    /**
     * Creates a queue that holds at most {@code capacity} values.
     *
     * @param capacity maximum number of queued values; must be positive
     * @throws IllegalArgumentException if {@code capacity} is not positive
     */
    public LockFreeQueue(int capacity) {
        if (capacity <= 0) {
            throw new IllegalArgumentException("capacity must be positive: " + capacity);
        }
        this.arena = Arena.ofShared();
        this.capacity = capacity;
        this.buffer = arena.allocate(capacity * entrySize, Long.BYTES);
        this.sequences = new java.util.concurrent.atomic.AtomicLongArray(capacity);
        // On the first lap, slot i is free for the producer of position i.
        for (int i = 0; i < capacity; i++) {
            sequences.set(i, 2L * i);
        }
    }

    /**
     * Appends {@code value} if a slot is free.
     *
     * @param value value to enqueue
     * @return {@code true} if the value was enqueued, {@code false} if the queue is full
     */
    public boolean offer(long value) {
        long pos = tail.get();
        while (true) {
            int slot = (int) (pos % capacity);
            long expected = 2 * pos;
            long diff = sequences.get(slot) - expected;

            if (diff == 0) {
                // The slot is free for this position: try to claim the position.
                if (tail.compareAndSet(pos, pos + 1)) {
                    buffer.set(ValueLayout.JAVA_LONG, slot * entrySize, value);
                    // Volatile write publishes the value to the consumer of this position.
                    sequences.set(slot, expected + 1);
                    return true;
                }
            } else if (diff < 0) {
                // The slot still belongs to the previous lap, so the queue is full.
                return false;
            }
            // Lost the race for this position, or our view of tail was stale: reload and retry.
            pos = tail.get();
        }
    }

    /**
     * Removes and returns the oldest value.
     *
     * @return the value, or {@code null} if the queue is empty
     */
    public Long poll() {
        long pos = head.get();
        while (true) {
            int slot = (int) (pos % capacity);
            long expected = 2 * pos + 1;
            long diff = sequences.get(slot) - expected;

            if (diff == 0) {
                // The value for this position is published: try to claim the position.
                if (head.compareAndSet(pos, pos + 1)) {
                    long value = buffer.get(ValueLayout.JAVA_LONG, slot * entrySize);
                    // Hand the slot to the producer that arrives one lap later.
                    sequences.set(slot, 2 * (pos + capacity));
                    return value;
                }
            } else if (diff < 0) {
                // Nothing has been published for this position yet, so the queue is empty.
                return null;
            }
            // Lost the race for this position, or our view of head was stale: reload and retry.
            pos = head.get();
        }
    }

    /** Releases the native memory; the queue must not be used afterwards. */
    @Override
    public void close() {
        arena.close();
    }
}
Cache-Friendly Data Structures
Aligned Allocation for Cache Lines:
/**
 * Counters laid out so that each one occupies its own 64-byte, 64-byte-aligned region.
 */
public class CacheAlignedStructures {
    private static final long CACHE_LINE_SIZE = 64;

    /**
     * A single {@code long} counter stored at the start of its own cache-line-sized block.
     * {@link #increment()} is a plain read followed by a plain write, not an atomic update.
     */
    public static class PaddedCounter {
        private final MemorySegment segment;

        /**
         * Allocates the counter's block from {@code arena} and sets the counter to zero.
         *
         * @param arena arena that owns the block
         */
        public PaddedCounter(Arena arena) {
            final MemorySegment line = arena.allocate(CACHE_LINE_SIZE, CACHE_LINE_SIZE);
            line.set(ValueLayout.JAVA_LONG, 0, 0L);
            this.segment = line;
        }

        /** Adds one to the counter. */
        public void increment() {
            final long next = segment.get(ValueLayout.JAVA_LONG, 0) + 1;
            segment.set(ValueLayout.JAVA_LONG, 0, next);
        }

        /** Returns the current counter value. */
        public long get() {
            return segment.get(ValueLayout.JAVA_LONG, 0);
        }
    }

    /**
     * An array of {@code long} counters spaced one cache line apart, so that no two counters
     * fall within the same 64-byte region. Increments are plain read-then-write, not atomic.
     */
    public static class CounterArray {
        private final MemorySegment counters;
        private final int count;

        /**
         * Allocates {@code count} counters from {@code arena}, all starting at zero.
         *
         * @param arena arena that owns the counters
         * @param count number of counters
         */
        public CounterArray(Arena arena, int count) {
            this.count = count;
            this.counters = arena.allocate(count * CACHE_LINE_SIZE, CACHE_LINE_SIZE);
            for (int slot = 0; slot < count; slot++) {
                counters.set(ValueLayout.JAVA_LONG, slotOffset(slot), 0L);
            }
        }

        /** Byte offset of the counter at {@code index}: one full cache line per counter. */
        private static long slotOffset(int index) {
            return index * CACHE_LINE_SIZE;
        }

        /**
         * Adds one to the counter at {@code index}.
         *
         * @param index counter to increment
         */
        public void increment(int index) {
            final long at = slotOffset(index);
            final long next = counters.get(ValueLayout.JAVA_LONG, at) + 1;
            counters.set(ValueLayout.JAVA_LONG, at, next);
        }

        /**
         * Returns the value of the counter at {@code index}.
         *
         * @param index counter to read
         * @return the counter's current value
         */
        public long get(int index) {
            return counters.get(ValueLayout.JAVA_LONG, slotOffset(index));
        }

        /** Returns the sum of all counters. */
        public long sum() {
            long total = 0;
            int slot = 0;
            while (slot < count) {
                total += get(slot);
                slot++;
            }
            return total;
        }
    }
}
Performance Benchmarking
import java.lang.foreign.*;
/**
 * Rough wall-clock comparisons of heap versus off-heap allocation and of bulk versus
 * element-wise copying. The timings are single unwarmed runs, so treat them as indicative only.
 */
public class MemoryBenchmark {

    /**
     * Times one million 1 KB heap array allocations against the same number of off-heap
     * allocations from a single confined arena, and prints both durations.
     *
     * <p>The off-heap blocks are only freed when the arena closes, so this needs roughly 1 GB
     * of native memory at its peak. The off-heap timing includes closing the arena.
     */
    public void benchmarkHeapVsOffHeap() {
        int iterations = 1_000_000;
        int size = 1024;

        // Heap allocation
        long heapStart = System.nanoTime();
        for (int i = 0; i < iterations; i++) {
            byte[] array = new byte[size];
            array[0] = (byte) i;
        }
        long heapTime = System.nanoTime() - heapStart;

        // Off-heap allocation
        long offHeapStart = System.nanoTime();
        try (Arena arena = Arena.ofConfined()) {
            for (int i = 0; i < iterations; i++) {
                MemorySegment segment = arena.allocate(size);
                segment.set(ValueLayout.JAVA_BYTE, 0, (byte) i);
            }
        }
        long offHeapTime = System.nanoTime() - offHeapStart;

        System.out.printf("Heap: %.2f ms\n", heapTime / 1_000_000.0);
        System.out.printf("Off-heap: %.2f ms\n", offHeapTime / 1_000_000.0);
    }

    /**
     * Copies 1 MB between two off-heap segments, once with a single bulk copy and once byte by
     * byte, and prints both durations and their ratio.
     */
    public void benchmarkCopyOperations() {
        int size = 1024 * 1024; // 1 MB
        byte[] srcArray = new byte[size];

        try (Arena arena = Arena.ofConfined()) {
            // Allocate, then copy the array in. This works on both the preview and the final
            // FFM API, unlike allocateArray, which the final API no longer offers.
            MemorySegment srcSegment = arena.allocate(size);
            MemorySegment.copy(srcArray, 0, srcSegment, ValueLayout.JAVA_BYTE, 0, size);
            MemorySegment dstSegment = arena.allocate(size);

            // Benchmark bulk copy
            long start = System.nanoTime();
            MemorySegment.copy(srcSegment, 0, dstSegment, 0, size);
            long bulkTime = System.nanoTime() - start;

            // Benchmark element-by-element
            start = System.nanoTime();
            for (int i = 0; i < size; i++) {
                byte b = srcSegment.get(ValueLayout.JAVA_BYTE, i);
                dstSegment.set(ValueLayout.JAVA_BYTE, i, b);
            }
            long elementTime = System.nanoTime() - start;

            System.out.printf("Bulk copy: %.2f ms\n", bulkTime / 1_000_000.0);
            System.out.printf("Element copy: %.2f ms\n", elementTime / 1_000_000.0);
            // Clamp to 1 ns so a coarse timer that reports 0 cannot produce "Infinity".
            System.out.printf("Speedup: %.1fx\n", (double) elementTime / Math.max(bulkTime, 1L));
        }
    }
}
Best Practices
1. Align Critical Data:
// Good - cache-aligned for performance
MemorySegment aligned = arena.allocate(size, 64);
// Consider alignment for:
// - Atomic operations
// - SIMD processing
// - Avoiding false sharing
2. Minimize Copies:
// Good - zero-copy with memory-mapped file
MemorySegment mapped = channel.map(...);
// Bad - unnecessary copy
byte[] data = mapped.toArray(ValueLayout.JAVA_BYTE);
3. Use Pooling for Hot Paths:
// Good - reuse allocations
MemorySegment buffer = pool.acquire();
try {
// Use buffer
} finally {
pool.release(buffer);
}
// Bad - constant allocation/deallocation
for (int i = 0; i < 1000000; i++) {
try (Arena arena = Arena.ofConfined()) {
MemorySegment seg = arena.allocate(1024);
}
}
4. Consider SoA for Vector Operations:
// Good - better cache locality for vector ops
float[] xCoords = ..., yCoords = ..., zCoords = ...;
// Bad - scattered access pattern
Point[] points = ...; // struct { float x, y, z; }
5. Profile Before Optimizing:
// Measure actual performance impact (warm up the JIT first; for reliable numbers use a benchmark harness such as JMH)
long start = System.nanoTime();
// ... operation ...
long duration = System.nanoTime() - start;
These advanced patterns enable high-performance memory operations for demanding applications.