• Optimizing AMD Memory Bandwidth: this is an interesting article by John McCalpin on optimizing single-thread memory bandwidth on Opteron systems. The running example sums the elements of an array. The initial simple program is transformed through a sequence of stages. The hoops one has to jump through to get the final performance numbers are impressive.

Simple Intra-node Benchmarks

The purpose of these benchmarks is to measure the time it takes to move data from one unit of memory to another within the same node. The benchmarks do not use any "high-level" libraries, like MPI or OpenMP. The goal is to find out the maximum capabilities of the machine. The source code is available via SVN. Contact Steve Siegel to obtain access.

Common code

This header file is included in all the benchmarks. It is located in the directory include. The implementation is in common/bench.c.

/* bench.h: Header file for common benchmark routines.
 * Author: Stephen F. Siegel
 * Last Modified: June, 2011 */
#define _XOPEN_SOURCE 600
#define _GNU_SOURCE
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include <stdlib.h>
#include <sched.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/syscall.h>
#include <sys/time.h>

/* Sets the CPU affinity of the calling thread to the given CPU.
 * The tid is used for diagnostic output only.  */
void setAffinity(int tid, int cpuid);

/* Print some information about the calling thread.
 * The tid should be the Pthreads thread id, but is not used except
 * that it is printed.  */
void printHello(int tid);

/* Invokes a Pthreads barrier on all threads. */
void callBarrier(int tid, pthread_barrier_t *barrier);

/* Returns the elapsed time in seconds, given two clock readings.
 * To get a time reading: struct timeval tv;  gettimeofday(&tv, NULL);
 * Fields of timeval:
 * long int tv_sec:
 *   This represents the number of seconds since the epoch.
 *   It is equivalent to a normal time_t value.
 * long int tv_usec:
 *   This is the fractional second value, represented as the
 *   number of microseconds. */
double elapsedTime(struct timeval tv1, struct timeval tv2);

Benchmark p2p: point-to-point bandwidth test

/* p2p.c: Linux multicore bandwidth test.  Measures time it
 * takes to send data from memory allocated on core i to memory
 * allocated on core j.
 * Author: Stephen F. Siegel
 * Last Modified: June, 2011
 * Warning: non-portable code!  Linux only!! */
#include "bench.h"

/* This many threads will be instantiated.  One thread will be assigned
 * to each core (or "CPU"). */
#define NUM_THREADS 24

/* One mega-byte (decimal: 10^6 bytes). */
#define MB 1000000L

/* The size of a buffer that will be transmitted in one call to memcpy.
 * Must be a multiple of sizeof(double).  The expansion is parenthesized
 * so that the macro behaves as a single value inside any expression
 * (for example x/BUFFER_SIZE). */
#define BUFFER_SIZE (500L*MB)

/* The number of times the memcpy will be repeated to try to get a more
 * accurate timing. */
#define REPEAT 2

/* One buffer for each thread, indexed by thread id. Each buffer will be
 * allocated BUFFER_SIZE bytes; the entry for a thread is filled in by
 * allocateBuffer.   */
double *buffers[NUM_THREADS];

/* Used to create a global barrier.  It is initialized in main for
 * NUM_THREADS participants and passed to callBarrier. */
pthread_barrier_t barr;

/* Allocates the send buffer or receive buffer for the calling thread.
void allocateBuffer(int tid) {
  size_t i, numElements = BUFFER_SIZE/sizeof(double);

  assert(tid >= 0 && tid < NUM_THREADS);
  buffers[tid] = (double*)calloc(numElements, sizeof(double));
  if (!buffers[tid]) {
    printf("calloc of buffer failed on thread %d\n", tid);

void initBuffer(int tid) {
  size_t i, numElements = BUFFER_SIZE/sizeof(double);

#ifdef DEBUG
  printf("Thread %d: initializing buffer...\n", tid);
  for (i=0; i<numElements; i++)
    buffers[tid][i] = -1.0; /* does it matter? */

/* Frees the buffer pointed to by buffers[tid] */
void freeBuffer(int tid) {

/* Executes the bandwidth test. */
void doTest(int tid) {
  double dataInMegaBytes = (REPEAT*BUFFER_SIZE)*1.0/(MB*1.0);
  int p, i;

  for (p=1; p<NUM_THREADS; p++) {
    struct timeval t0;
    int repeat;

      if (tid == 0 || tid == p) initBuffer(tid);
      callBarrier(tid, &barr);
    if (tid == 0) gettimeofday(&t0,NULL);
    for (repeat = 0; repeat < REPEAT; repeat++) {
      if (tid == 0) {
#ifdef DEBUG
	printf("Thread 0: copying %d MB to thread %d...\n",
	       BUFFER_SIZE/MB, p);
	memcpy(buffers[p], buffers[0], BUFFER_SIZE);
      callBarrier(tid, &barr);
    } /* end repeat loop */
    if (tid == 0) {
      struct timeval t1;
      double timeInSeconds, bandwidth;

      gettimeofday(&t1, NULL);
      timeInSeconds = elapsedTime(t0,t1);
      bandwidth = dataInMegaBytes/timeInSeconds;
      printf("0->%d: time(s):%f, data(MB):%f, bandwidth(MB/s):%f\n",
	     p, timeInSeconds, dataInMegaBytes, bandwidth);
  callBarrier(tid, &barr);

/* This function is run by each thread when the thread is instantiated.
 * tid_pointer carries the thread index itself (an integer cast to
 * void*), not the address of one.
 * The thread pins itself to the CPU with the same number as its index,
 * allocates its buffer, takes part in the bandwidth test and finally
 * releases the buffer.  doTest ends with a barrier, so no buffer is
 * freed while another thread may still be copying into it.
 * Always returns a null pointer. */
void *run(void *tid_pointer) {
  long tid_long = (long)tid_pointer;
  int tid = (int)tid_long;

  setAffinity(tid, tid);
  allocateBuffer(tid);
  doTest(tid);
  freeBuffer(tid);
  return (void*)0; /* to satisfy compiler */
}

/* Main function: spawns the threads.   No commandline args used.  */
int main (int argc, char *argv[]) {
  pthread_t threads[NUM_THREADS];
  int rc;
  long tid;

  assert(BUFFER_SIZE%sizeof(double) == 0);
  if(pthread_barrier_init(&barr, NULL, NUM_THREADS)) {
    printf("main: could not create a barrier\n");
    return -1;
  for (tid=0; tid<NUM_THREADS; tid++){
#ifdef DEBUG
    printf("main: creating thread %ld\n", tid);
    rc = pthread_create(&threads[tid], NULL, run, (void*)tid);
    if (rc) {
      printf("ERROR; return code from pthread_create() is %d\n", rc);
  for (tid=0; tid<NUM_THREADS; tid++)
    pthread_join(threads[tid], NULL);