Dvbmonkey’s Blog

March 6, 2009

Simple Speed Tests

Filed under: Uncategorized — dvbmonkey @ 1:01 pm
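
Four drives compared with hdparm -Tt: the cached-read figure mostly reflects memory and buffer-cache throughput, while the buffered disk read shows the sustained rate actually coming off the drive.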

OCZ 30GB SSD

$ hdparm -Tt /dev/sdb
/dev/sdb:
 Timing cached reads:   3358 MB in  2.00 seconds = 1679.46 MB/sec
 Timing buffered disk reads:  452 MB in  3.01 seconds = 150.05 MB/sec

Western Digital WD10EADS 1TB

$ hdparm -Tt /dev/sdb
/dev/sdb:
 Timing cached reads:   3372 MB in  2.00 seconds = 1685.85 MB/sec
 Timing buffered disk reads:  274 MB in  3.00 seconds =  91.23 MB/sec

Hitachi HDT721010SLA360 1TB

$ hdparm -Tt /dev/sdb
/dev/sdb:
 Timing cached reads:   3360 MB in  2.00 seconds = 1680.70 MB/sec
 Timing buffered disk reads:  324 MB in  3.01 seconds = 107.63 MB/sec

Hitachi Deskstar HDP725032GLA360 320GB

$ hdparm -Tt /dev/sdb
/dev/sdb:
 Timing cached reads:   3304 MB in  2.00 seconds = 1652.26 MB/sec
 Timing buffered disk reads:  252 MB in  3.01 seconds =  83.62 MB/sec

March 3, 2009

Open MPI Master & Servant Example – BogoMips

Filed under: linux — dvbmonkey @ 12:44 pm

In yesterday’s post I introduced a simple ‘master & servant’ technique where I used the rank-0 node to collate results from all the other nodes. To do this I used MPI_Send and MPI_Recv to send and receive 128-byte MPI_CHAR strings. Today I am extending the example by sending and receiving MPI_FLOATs to demonstrate that native C/C++ numerical values can be passed between nodes just as easily.

What are BogoMips?

From the Wikipedia article, BogoMips are an “unscientific measurement of CPU speed made by the Linux kernel when it boots, to calibrate an internal busy-loop”. If you’ve used Linux for some time you may have noticed the “BogoMips” value in the boot-up console messages. Alternatively, you can cat /proc/cpuinfo to see the values your Linux kernel calculated during boot.

Example: bogomips.c

Based on the Linux kernel code in init/main.c and include/linux/delay.h, and on the ‘Standalone BogoMips’ example code by Jeff Tranter, here is a really simple ‘MPI BogoMips’ calculation: each node averages 10 BogoMips calculations for itself and MPI_Sends the result to the rank-0 node, which sums them up and prints the total.

/*
 * Based on Linux kernel code in init/main.c and include/linux/delay.h
 * and the example code by Jeff Tranter (Jeff_Tranter@Mitel.COM)
 */

#include <stdio.h>
#include <time.h>
#include <mpi.h>

// #define PORTABLE_BOGOMIPS
#define CLASSIC_BOGOMIPS

#ifdef CLASSIC_BOGOMIPS
    /* the original code from the Linux kernel */
    int HZ = 100;

    #define rdtscl(low) \
         __asm__ __volatile__ ("rdtsc" : "=a" (low) : : "edx")

    //This delay() is the one used on x86's with TSC after 2.2.14.
    //It won't work on a non TSC x86, period.
    void __inline__ delay(unsigned long loops)
    {
        unsigned long bclock, now;
        rdtscl(bclock);
        do {
    	rdtscl(now);
        }
        while ((now - bclock) < loops);
    }
#endif

#ifdef PORTABLE_BOGOMIPS
    /* portable version */
    static void delay(int loops)
    {
        long i;
        for (i = loops; i >= 0; i--);
    }
#endif

/* this should be approx 2 Bo*oMips to start (note initial shift), and will
 * still work even if initially too large, it will just take slightly longer */
unsigned long loops_per_jiffy = (1 << 12);

/* This is the number of bits of precision for the loops_per_jiffy.  Each
 * bit takes on average 1.5/HZ seconds.  This (like the original) is a little
 * better than 1% */
#define LPS_PREC 8

int numprocs, rank, namelen;
char processor_name[MPI_MAX_PROCESSOR_NAME];

//plagiarized straight from the 2.4 sources.
float calibrate_delay(void)
{
    unsigned long ticks, loopbit;
    int lps_precision = LPS_PREC;
    loops_per_jiffy = (1 << 12);
    while (loops_per_jiffy <<= 1) {
	ticks = clock();
	while (ticks == clock())
	    /* nothing */ ;
	ticks = clock();
	delay(loops_per_jiffy);
	ticks = clock() - ticks;
	if (ticks)
	    break;
    }
    loops_per_jiffy >>= 1;
    loopbit = loops_per_jiffy;
    while (lps_precision-- && (loopbit >>= 1)) {
	loops_per_jiffy |= loopbit;
	ticks = clock();
	while (ticks == clock());
	ticks = clock();
	delay(loops_per_jiffy);
	if (clock() != ticks)
	    loops_per_jiffy &= ~loopbit;
    }
    return (loops_per_jiffy / (500000/HZ)) + (float)((loops_per_jiffy/(5000/HZ))%100) / (float)100;
}

int main(int argc, char *argv[])
{
    int i;

    MPI_Status stat;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Get_processor_name(processor_name, &namelen);

    float bogomips = 0;
    for ( i = 0; i < 10; i++ ) {
        bogomips += calibrate_delay();
    }
    bogomips = bogomips / (float) 10;

    printf( "[%02d/%02d %s] %f BogoMips\n", rank, numprocs, processor_name, bogomips );

    if ( rank == 0 ) {
        float totalBogomips = bogomips;
        for ( i = 1; i < numprocs; i++ ) {
            float f = 0;
            MPI_Recv(&f, 1, MPI_FLOAT, i, 0, MPI_COMM_WORLD, &stat);
            totalBogomips += f;
        }
        printf( "Total = %f BogoMips\n", totalBogomips );
    } else {
        MPI_Send(&bogomips, 1, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);
    }

    MPI_Finalize();
    return 0;
}
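
Build it with mpicc bogomips.c -o bogomips and launch it with mpirun, as in the earlier posts; the run below used 18 processes spread across five nodes.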

The result of running this should look something like this:

[00/18 mpinode01] 2293.593018 BogoMips
[01/18 mpinode01] 2147.446045 BogoMips
[02/18 mpinode01] 2230.513916 BogoMips
[03/18 mpinode01] 2473.651855 BogoMips
[04/18 mpinode02] 3659.688721 BogoMips
[05/18 mpinode02] 4057.167236 BogoMips
[06/18 mpinode02] 4067.651123 BogoMips
[07/18 mpinode02] 4419.580078 BogoMips
[08/18 mpinode03] 2368.138916 BogoMips
[09/18 mpinode03] 3327.585938 BogoMips
[10/18 mpinode03] 3277.451904 BogoMips
[11/18 mpinode03] 3130.323975 BogoMips
[12/18 mpinode04] 2940.759766 BogoMips
[13/18 mpinode04] 3207.983154 BogoMips
[14/18 mpinode04] 4362.892090 BogoMips
[15/18 mpinode04] 3313.822998 BogoMips
[16/18 mpinode05] 2390.749023 BogoMips
[17/18 mpinode05] 3017.437012 BogoMips
Total = 56686.441406 BogoMips

🙂

March 2, 2009

An Open MPI Master & Servant Example

Filed under: linux — dvbmonkey @ 2:26 pm

Building on the Getting started… post from last week, I’ve knocked up a quick example showing one way to get your MPI processes to communicate with one another.

master_servant.c:

#include <stdio.h>
#include <mpi.h>
#include <unistd.h>

int main(int argc, char *argv[]) {
   int numprocs, rank, namelen;
   char processor_name[MPI_MAX_PROCESSOR_NAME];

   MPI_Init(&argc, &argv);
   MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
   MPI_Get_processor_name(processor_name, &namelen);

   if ( rank == 0 ) {
      printf( "[%02d/%02d %s]: I am the master\n", rank, numprocs, processor_name );
      // Tell the servants to do something
   } else {
      printf( "[%02d/%02d %s]: I am a servant\n", rank, numprocs, processor_name );
      // Wait for something to do
   }

   MPI_Finalize();
   return 0;
}

Build this with mpicc master_servant.c -o master_servant and run it; you should get something like:

[00/08 mpinode01]: I am the master
[01/08 mpinode01]: I am a servant
[02/08 mpinode01]: I am a servant
[03/08 mpinode01]: I am a servant
[04/08 mpinode02]: I am a servant
[05/08 mpinode02]: I am a servant
[06/08 mpinode02]: I am a servant
[07/08 mpinode02]: I am a servant

OK, this means that, based on the rank returned by MPI_Comm_rank, we can decide which instance of the program is going to act as the “master” and which instance(s) are going to act as “servants” – pretty neat!

In the next example we build on this by getting the program instances to pass messages to one another; it is borrowed from the example found here.

master_servant2.c:

#include <stdio.h>
#include <string.h>
#include <mpi.h>
#include <unistd.h>

int main(int argc, char *argv[]) {
   char idstr[32], buff[128];
   int numprocs, rank, namelen, i;
   char processor_name[MPI_MAX_PROCESSOR_NAME];

   MPI_Status stat;
   MPI_Init(&argc, &argv);
   MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
   MPI_Get_processor_name(processor_name, &namelen);

   // Based on example from https://wiki.inf.ed.ac.uk/pub/ANC/ComputationalResources/slides.pdf
   if (rank == 0) {
      // This is the rank-0 copy of the process
      printf("We have %d processors\n", numprocs);
      // Send each process a "Hello ... " string
      for(i = 1; i < numprocs; i++) {
         sprintf(buff, "Hello %d... ", i);
         MPI_Send(buff, 128, MPI_CHAR, i, 0, MPI_COMM_WORLD);
      }
      // Go into a blocking-receive for each servant process
      for(i = 1; i < numprocs; i++) {
         MPI_Recv(buff, 128, MPI_CHAR, i, 0, MPI_COMM_WORLD, &stat);
         printf("%s\n", buff);
      }
   } else {
      // Go into a blocking-receive waiting
      MPI_Recv(buff, 128, MPI_CHAR, 0, 0, MPI_COMM_WORLD, &stat);
      // Append our identity onto the received string
      sprintf(idstr, "Processor %d ", rank);
      strcat(buff, idstr);
      strcat(buff, "reporting!");
      // Send the string back to the rank-0 process
      MPI_Send(buff, 128, MPI_CHAR, 0, 0, MPI_COMM_WORLD);
   }

   MPI_Finalize();
   return 0;
}

Build this example with mpicc master_servant2.c -o master_servant2 and run it; you should get output like the following:

We have 8 processors
Hello 1... Processor 1 reporting!
Hello 2... Processor 2 reporting!
Hello 3... Processor 3 reporting!
Hello 4... Processor 4 reporting!
Hello 5... Processor 5 reporting!
Hello 6... Processor 6 reporting!
Hello 7... Processor 7 reporting!

Now you can use this master/servant technique to partition work across instances of your MPI-capable program.
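
For example, here is a minimal sketch (not from the original post) of splitting one job across the ranks using the same send/receive pattern: every rank, including rank 0, sums its own slice of the numbers 1..N, and the servants send their partial sums back to the rank-0 process, which adds them up. The file name partial_sums.c and the choice of “work” are purely illustrative.

partial_sums.c:

#include <stdio.h>
#include <mpi.h>

int main(int argc, char *argv[]) {
   int numprocs, rank, i;
   long N = 1000000;                      // total amount of "work" to split up
   long chunk, start, end, x;
   double partial = 0, total = 0, p;

   MPI_Status stat;
   MPI_Init(&argc, &argv);
   MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
   MPI_Comm_rank(MPI_COMM_WORLD, &rank);

   // Each rank takes a contiguous slice of 1..N; the last rank picks up the remainder
   chunk = N / numprocs;
   start = rank * chunk + 1;
   end = (rank == numprocs - 1) ? N : start + chunk - 1;
   for (x = start; x <= end; x++)
      partial += (double) x;

   if (rank == 0) {
      // Collect one partial sum from each servant and add them to our own
      total = partial;
      for (i = 1; i < numprocs; i++) {
         MPI_Recv(&p, 1, MPI_DOUBLE, i, 0, MPI_COMM_WORLD, &stat);
         total += p;
      }
      printf("Sum of 1..%ld = %.0f\n", N, total);
   } else {
      // Send our partial sum to the rank-0 process
      MPI_Send(&partial, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
   }

   MPI_Finalize();
   return 0;
}

For this collect-and-add step MPI also provides MPI_Reduce, which performs the gathering and the summation in a single call.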

Troubleshooting

If you get an error like this “[hostname][0,1,0][btl_tcp_endpoint.c:572:mca_btl_tcp_endpoint_complete_connect] connect() failed with errno=113”, try shutting down iptables on the MPI nodes. It was a quick fix for me; I am sure there is a ‘proper’ way to configure it, though. Keep in mind it’s probably not a good idea to switch off iptables on a machine that’s connected to the open internet; the machines I have used in this guide are all on an internal network.
