Mailing Lists: Apple Mailing Lists
Image of Mac OS face in stamp
RE: Using CHUD Framework API
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

RE: Using CHUD Framework API

Diarmuid Corcoran wrote:

> ... PPC performance monitor registers (7447A). Some of these events
> cause the 32 bit PMC registers to overflow within a matter of
> seconds (ie total L1 cache hits, processor cycles...) so I was
> wondering is there any means built into the api to handle this kind
> of overflow.

Here's some code I used to implement a 64-bit counter for CPU cycles on the
970 CPU.  You'll have to adapt it for the 7447A, if possible.

				-- edp (Eric Postpischil)

This code was intended to read CPU cycles, so there are references to that
and time in the code, but it can be adapted to read other events.

This code sets PMC 1 to event 15, to count CPU cycles, and it sets PMC 2 to
event 10, which counts overflows from PMC 1.  When this is done, PMC 2 and
PMC 1 together form a 64-bit counter of CPU cycles.

Note that the performance monitor counter register numbers (771 and 772
here) and the event numbers are CPU-specific.  Do not expect this code to
work unaltered on CPUs other than the 970.  Other CPUs might not have an
event that counts overflows of another PMC, so this sort of 64-bit counting
might not be possible.

Also, this code was extracted from a larger program -- I have deleted a
number of other things that were woven in with it, and I have not tested the
result.  I could have broken something while editing.

GCC-specific code is used to read the registers, notably the asm constructs.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/*  A 64-bit tick count held as two 32-bit words.  The upper word carries
    the high-order bits and the lower word the low-order bits.
*/
typedef struct {
    uint32_t upper;    // High-order 32 bits.
    uint32_t lower;    // Low-order 32 bits.
} ClockValue;

// Return the current value of the clock.
static inline ClockValue ReadClock(void)
     /*    We must read the time as two values from some source, but we
         read them simultaneously, so the low value might roll over while we
         are reading them.  We will read the upper value twice and choose a
         correct one afterward -- see below.
     ClockValue result;
     uint32_t upper0, upper1;

     /*    Get values from the upper and lower count registers.

         The lower register might overflow into the upper register while we
         are doing this, so we will read the upper register twice.  We will
         assume the overflow can occur only once during these three
         instructions, so one of the two values of the upper register is
         correct.  If the lower value is small, it will not overflow for a
         while, so the upper value we read immediately after it must be
         correct.  Conversely, if the lower value is large, it must not have
         overflowed recently, so the upper value we read immediately before
         it must be correct.

     // Read values from performance monitor counters.
     __asm__ volatile("\
             mfspr    %[upper0], 772    \n\
             mfspr    %[lower] , 771    \n\
             mfspr    %[upper1], 772    "
         :    [lower]  "=r" (result.lower),    // result.lower is output.
             [upper0] "=r" (upper0),            // upper0 is output.
             [upper1] "=r" (upper1)            // upper1 is output.

     /*    Choose which upper value to use.  We could do this with a
         conditional expression:

             result.upper = result.lower < 2147483648u
                 ? upper1
                 : upper0;

         However, the execution time might change depending on whether the
         branch were correctly predicted or not.  Instead, we will use a
         calculation with no branches.

     // Use a signed shift to copy lower's high bit to all 32 bits.
     uint32_t mask = (int32_t) result.lower >> 31;

     result.upper = upper1 ^ ((upper0 ^ upper1) & mask);
         /*    If mask is all zeroes, the above statement reduces:

                 result.upper = upper1 ^ ((upper0 ^ upper1) & mask);
                 result.upper = upper1 ^ ((upper0 ^ upper1) & 0);
                 result.upper = upper1 ^ (0);
                 result.upper = upper1;

             If mask is all ones, the above statement reduces:

                 result.upper = upper1 ^ ((upper0 ^ upper1) & mask);
                 result.upper = upper1 ^ ((upper0 ^ upper1) & ~0);
                 result.upper = upper1 ^ (upper0 ^ upper1);
                 result.upper = upper0;

     return result;

/* Subtract two clock values, t1 and t0, and return t1-t0.

    Since some ClockValue implementations are unsigned, t1 should be not
    than t0.
static ClockValue SubtractClock(const ClockValue t1, const ClockValue t0)
     ClockValue result;

     result.upper = t1.upper - t0.upper;
     result.lower = t1.lower - t0.lower;

     // If necessary, "borrow" from upper word.
     if (t1.lower < t0.lower)

     return result;

/*    Convert clock value to double, without changing units.  That is, both
     input and the return value are the same number of clock ticks.
static double ClockToDouble(const ClockValue t)
     return t.upper * 4294967296. + t.lower;

#include <CHUD/CHUD.h>

/*    CheckCHUDStatus.

     If CHUD status is not success, print an error.
static void CheckCHUDStatus(int status, const char *routine)
     if (status != chudSuccess)
             "Error, %s returned %d.\nCHUD status string is \"%s\".\n",
             routine, status, chudGetStatusStr());

/*    RequireCHUDStatus.

     If CHUD status is not success, print an error and exit.
static void RequireCHUDStatus(int status, const char *routine)
     CheckCHUDStatus(status, routine);
     if (status != chudSuccess)

/*  StopClockServices.

    Release any facilities acquired for clock services.

    Each CHUD call is checked with CheckCHUDStatus, so a failure is
    reported but the remaining release steps are still attempted.  The
    signature is void (void), so this routine can be registered with atexit.
*/
static void StopClockServices(void)
{
    int result = chudStopPMCs();
    CheckCHUDStatus(result, "chudStopPMCs");

    result = chudReleaseSamplingFacility(0);
    CheckCHUDStatus(result, "chudReleaseSamplingFacility");

    // Make all processors available again.
    result = chudSetNumProcessors(chudPhysicalProcessorCount());
    CheckCHUDStatus(result, "chudSetNumProcessors");
}


/*    StartClockServices.

     Initialize the environment for timing routines.  If resources are
     that should be free, an exit handler is set to release them.  (In
     particular, if we set the system to use a single CPU while timing, we
     to reset it afterward to use all CPUs.)
static void StartClockServices(void)
     // Execute this routine only once.
     static bool started = false;
     if (started)
     started = true;

     // Initialize CHUD facilities.

     int result = chudInitialize();
     RequireCHUDStatus(result, "chudInitialize");

     // Restrict system to one processor.
     // (What we really want is to bind this process to one processor.)
     result = chudSetNumProcessors(1);
     RequireCHUDStatus(result, "chudSetNumProcessors");

     result = chudAcquireSamplingFacility(CHUD_NONBLOCKING);
     RequireCHUDStatus(result, "chudAcquireSamplingFacility");

     // On a 970 CPU, PMC1 event 15 is CPU cycles.
     result = chudSetPMCEvent(chudCPU1Dev, PMC_1, 15);
     RequireCHUDStatus(result, "chudSetPMCEvent");

     result = chudSetPMCMode(chudCPU1Dev, PMC_1, chudCounter);
     RequireCHUDStatus(result, "chudSetPMCMode");

     // On a 970 CPU, PMC2 event 10 is PMC1 overflow.
     result = chudSetPMCEvent(chudCPU1Dev, PMC_2, 10);
     RequireCHUDStatus(result, "chudSetPMCEvent");

     result = chudSetPMCMode(chudCPU1Dev, PMC_2, chudCounter);
     RequireCHUDStatus(result, "chudSetPMCMode");

     result = chudClearPMCs();
     RequireCHUDStatus(result, "chudClearPMCs");

     result = chudStartPMCs();
     RequireCHUDStatus(result, "chudStartPMCs");

     // Set exit handler to release resources.
     result = atexit(StopClockServices);
     if (result != 0)
         fprintf(stderr, "Error, atexit returned %d.\n", result);

/*    This routine measures the gross amount of time it takes to execute an
     arbitrary routine, including whatever overhead there is, such as
     the clock value and loading the routine into cache.


         void (*routine)(void *).
             Address of a routine to measure.

         void *data.
             Pointer to be passed to the routine.

         Return value.
             Gross time to execute.
static ClockValue MeasureGrossTime(void (*routine)(void *), void *data)
     ClockValue t0, t1;

     // Get time before executing routine.
     t0 = ReadClock();

     // Execute routine.

     // Get time after executing routine.
     t1 = ReadClock();

     // Return difference in times.
     return SubtractClock(t1, t0);

Do not post admin requests to the list. They will be ignored.
PerfOptimization-dev mailing list      (email@hidden)
Help/Unsubscribe/Update your Subscription:

This email sent to email@hidden

Visit the Apple Store online or at retail locations.

Contact Apple | Terms of Use | Privacy Policy

Copyright © 2011 Apple Inc. All rights reserved.