setitimer late in delivering signals
setitimer late in delivering signals
- Subject: setitimer late in delivering signals
- From: "Matthew Fluet" <email@hidden>
- Date: Wed, 8 Aug 2007 10:33:44 -0500
Hi,
I have an application that uses setitimer(ITIMER_PROF,...) and
sigaction(SIGPROF,...) for profiling. On a dual-core/dual-proc Intel
MacPro (MacOSX 10.4.10), I was observing that the profiler would
report 5s of CPU time (based on the number of SIGPROFs handled), but
/usr/bin/time was clearly indicating that the application was using
136s of CPU time.
A little investigating shows that, while setitimer was called with a
10000 us interval, there were a number of occasions where more than
*10 seconds* elapsed between invocations of the SIGPROF handler.
This short program demonstrates the problem:
/* begin test.c */
#include <stdlib.h>
#include <stdio.h>
#include <signal.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/resource.h>
/* Compute tv = tv1 + tv2, normalizing so that 0 <= tv->tv_usec < 1000000. */
void timevalPlus (struct timeval *tv1, struct timeval *tv2, struct timeval *tv) {
    const long int usec_per_sec = 1000000;
    long int total_usec = tv1->tv_usec + tv2->tv_usec;
    tv->tv_sec = tv1->tv_sec + tv2->tv_sec + total_usec / usec_per_sec;
    tv->tv_usec = total_usec % usec_per_sec;
}
/* Compute tv = tv1 - tv2, assuming tv1 >= tv2. One second is borrowed up
 * front so the microsecond arithmetic never goes negative; the division
 * and modulus then renormalize the result. */
void timevalMinus (struct timeval *tv1, struct timeval *tv2, struct timeval *tv) {
    const long int usec_per_sec = 1000000;
    long int borrowed_usec = tv1->tv_usec + usec_per_sec - tv2->tv_usec;
    tv->tv_sec = (tv1->tv_sec - tv2->tv_sec) - 1 + borrowed_usec / usec_per_sec;
    tv->tv_usec = borrowed_usec % usec_per_sec;
}
#define INTERVAL_USEC 10000
/* Measure the process CPU time (user + system, via getrusage) elapsed since
 * the previous call and classify how late the SIGPROF delivery was relative
 * to the 10000us (INTERVAL_USEC) timer interval:
 *   bad   : diff in (15000us, 50000us]
 *   worse : diff in (50000us, 1s)
 *   worst : diff >= 1s
 * Prints a status line every 500 samples or whenever a "worst" delay is
 * observed, and exits the process after 1000 samples. Keeps its state in
 * function-local statics, so it must only be called from one context. */
static void check() {
    struct rusage ru;
    struct timeval rusage_time_cur;
    struct timeval rusage_time_diff;
    static struct timeval rusage_time_last;   /* CPU time at previous call */
    static long int count = 0;                /* samples taken so far */
    static long int bad = 0;
    static long int worse = 0;
    static long int worst = 0;

    if (0 != getrusage (0, &ru)) {
        fprintf (stderr, "getrusage failed\n");
        exit(1);
    }
    timevalPlus(&ru.ru_utime, &ru.ru_stime, &rusage_time_cur);
    if (count > 0) {
        timevalMinus (&rusage_time_cur, &rusage_time_last, &rusage_time_diff);
        int show_status = 0;
        if (count % 500 == 0) show_status = 1;
        if (rusage_time_diff.tv_sec > 0) {
            worst++;
            show_status = 1;
        } else if (rusage_time_diff.tv_usec > 5 * INTERVAL_USEC) {
            worse++;
        } else if (rusage_time_diff.tv_usec > ((3 * INTERVAL_USEC) / 2)) {
            bad++;
        }
        if (show_status) {
            printf ("count = %li, bad = %li, worse = %li, worst = %li\n",
                    count, bad, worse, worst);
            /* NOTE(review): this literal was line-wrapped by the mail client
             * in the original posting; rejoined here so the file compiles. */
            printf ("rusage_time_diff.tv_sec = %li, rusage_time_diff.tv_usec = %li\n\n",
                    (long int)rusage_time_diff.tv_sec,
                    (long int)rusage_time_diff.tv_usec);
        }
    }
    if (count >= 1000) {
        printf ("count = %li, bad = %li, worse = %li, worst = %li\n",
                count, bad, worse, worst);
        printf ("rusage_time_cur.tv_sec = %li, rusage_time_cur.tv_usec = %li\n",
                (long int)rusage_time_cur.tv_sec, (long int)rusage_time_cur.tv_usec);
        exit (0);
    }
    count++;
    rusage_time_last = rusage_time_cur;
}
/* Flag set by the SIGPROF handler and polled by the main loop.
 * sig_atomic_t is the only integer type the C standard guarantees can be
 * safely written from an async signal handler (C11 5.1.2.3, CERT SIG31-C);
 * plain volatile int has no such guarantee. */
static volatile sig_atomic_t perform_check;

/* SIGPROF handler: only raises the flag. The actual measurement
 * (getrusage, printf) is not async-signal-safe, so it is deferred to the
 * main loop, which calls check() when it sees the flag. */
static void action(int sig, siginfo_t *sip, void* z) {
    (void)sig;
    (void)sip;
    (void)z;
    perform_check = 1;
}
int main(int argc, char* argv[]) {
struct sigaction sa;
struct itimerval iv;
perform_check = 0;
sigemptyset (&sa.sa_mask);
sa.sa_flags = SA_RESTART | SA_SIGINFO;
sa.sa_sigaction = (void (*)(int, siginfo_t*, void*))action;
if (0 != sigaction (SIGPROF, &sa, NULL)) {
fprintf (stderr, "sigaction failed\n");
exit(1);
}
iv.it_interval.tv_sec = 0;
iv.it_interval.tv_usec = INTERVAL_USEC;
iv.it_value.tv_sec = 0;
iv.it_value.tv_usec = INTERVAL_USEC;
if (0 != setitimer (ITIMER_PROF, &iv, NULL)) {
fprintf (stderr, "setitimer failed\n");
exit(1);
}
/* Something to keep the processor busy. */
int num, cycleLen, n;
num = 0;
while (1) {
if (perform_check) { check(); perform_check = 0; }
cycleLen = 0;
n = num;
while (1) {
if (perform_check) { check(); perform_check = 0; }
if (n == 0 || n == 1) {
break;
} else if (n % 2 == 1) {
n = 3 * n + 1;
} else {
n = n / 2;
}
cycleLen++;
}
if (cycleLen > 710) {
printf ("num = %i, cycleLen = %i\n", num, cycleLen);
}
num++;
}
return 0;
}
/* end test.c */
Essentially, the 'check' function determines the elapsed CPU time
between invocations, and records when things are "bad" (elapsed time
is between 15000us and 50000us), "worse" (elapsed time is between
50000us and 1s), and "worst" (elapsed time greater than 1s).
Running on my MacPro, I get output like the following:
/* begin x86-darwin.4cpu.txt */
count = 46, bad = 0, worse = 0, worst = 1
rusage_time_diff.tv_sec = 1, rusage_time_diff.tv_usec = 879932
count = 84, bad = 0, worse = 0, worst = 2
rusage_time_diff.tv_sec = 1, rusage_time_diff.tv_usec = 729972
...
count = 610, bad = 0, worse = 14, worst = 17
rusage_time_diff.tv_sec = 14, rusage_time_diff.tv_usec = 189995
count = 614, bad = 0, worse = 14, worst = 18
rusage_time_diff.tv_sec = 64, rusage_time_diff.tv_usec = 809370
count = 772, bad = 0, worse = 14, worst = 19
rusage_time_diff.tv_sec = 1, rusage_time_diff.tv_usec = 169648
count = 779, bad = 0, worse = 14, worst = 20
rusage_time_diff.tv_sec = 6, rusage_time_diff.tv_usec = 379854
count = 851, bad = 0, worse = 14, worst = 21
rusage_time_diff.tv_sec = 124, rusage_time_diff.tv_usec = 199970
count = 856, bad = 0, worse = 14, worst = 22
rusage_time_diff.tv_sec = 5, rusage_time_diff.tv_usec = 319675
...
count = 994, bad = 0, worse = 18, worst = 28
rusage_time_diff.tv_sec = 2, rusage_time_diff.tv_usec = 209937
count = 1000, bad = 0, worse = 18, worst = 28
rusage_time_diff.tv_sec = 0, rusage_time_diff.tv_usec = 10000
count = 1000, bad = 0, worse = 18, worst = 28
rusage_time_cur.tv_sec = 363, rusage_time_cur.tv_usec = 911990
/* end x86-darwin.4cpu.txt */
which shows, for example, that the 610th signal was delivered 14s
after the 609th, the 614th signal was delivered 64s after the 613th,
and the 851st signal was delivered 124s after the 850th. The final
lines show that the program spent a total of 363s waiting for the
delivery of 1000 signals.
Using the CHUD tools, I disabled all but one of the cores on my
MacPro; then I get output like the following:
/* begin x86-darwin.1cpu.txt */
count = 500, bad = 1, worse = 0, worst = 0
rusage_time_diff.tv_sec = 0, rusage_time_diff.tv_usec = 9967
count = 1000, bad = 10, worse = 0, worst = 0
rusage_time_diff.tv_sec = 0, rusage_time_diff.tv_usec = 9984
count = 1000, bad = 10, worse = 0, worst = 0
rusage_time_cur.tv_sec = 10, rusage_time_cur.tv_usec = 31320
/* end x86-darwin.1cpu.txt */
which is much more reasonable -- only 10 signals were delivered
slightly late and the program spent a total of 10s waiting for the
delivery of 1000 signals.
With 2 cores enabled, I get output like the following:
/* begin x86-darwin.2cpu.txt */
count = 500, bad = 4, worse = 19, worst = 0
rusage_time_diff.tv_sec = 0, rusage_time_diff.tv_usec = 10000
count = 833, bad = 5, worse = 35, worst = 1
rusage_time_diff.tv_sec = 1, rusage_time_diff.tv_usec = 9974
count = 1000, bad = 6, worse = 45, worst = 1
rusage_time_diff.tv_sec = 0, rusage_time_diff.tv_usec = 269998
count = 1000, bad = 6, worse = 45, worst = 1
rusage_time_cur.tv_sec = 32, rusage_time_cur.tv_usec = 267562
/* end x86-darwin.2cpu.txt */
which is somewhat better than the 4cpu experiment, but the program is
still spending a total of 32s waiting for the delivery of 1000 signals.
The phenomenon is also present on PowerPC; on a dual-processor PowerPC
MacPro, I get output like the following:
/* begin powerpc-darwin.2cpu.txt */
count = 426, bad = 1, worse = 10, worst = 1
rusage_time_diff.tv_sec = 1, rusage_time_diff.tv_usec = 9804
count = 500, bad = 1, worse = 16, worst = 1
rusage_time_diff.tv_sec = 0, rusage_time_diff.tv_usec = 10000
count = 506, bad = 1, worse = 16, worst = 2
rusage_time_diff.tv_sec = 1, rusage_time_diff.tv_usec = 79375
count = 652, bad = 1, worse = 19, worst = 3
rusage_time_diff.tv_sec = 1, rusage_time_diff.tv_usec = 329479
count = 1000, bad = 1, worse = 32, worst = 3
rusage_time_diff.tv_sec = 0, rusage_time_diff.tv_usec = 9999
count = 1000, bad = 1, worse = 32, worst = 3
rusage_time_cur.tv_sec = 26, rusage_time_cur.tv_usec = 334582
/* end powerpc-darwin.2cpu.txt */
I also observe the late delivery of signals with
ITIMER_VIRTUAL/SIGVTALRM, though (curiously) not with
ITIMER_REAL/SIGALRM. Nonetheless, for my application,
ITIMER_PROF/SIGPROF is really the appropriate sampling mechanism. On
multi-cpu Linux machines (x86 and amd64), with the above program, I
observe no late delivery of signals and the program spends a total of
11s waiting for the delivery of 1000 signals.
While I suppose that setitimer doesn't provide a guarantee on the
timely delivery of signals, 124s on a 10000us interval seems a bit
excessive.
Is this a known problem? Is there a workaround? Is there a better
forum to raise this issue?
Thanks,
-Matthew
_______________________________________________
Do not post admin requests to the list. They will be ignored.
Darwin-dev mailing list (email@hidden)
Help/Unsubscribe/Update your Subscription:
This email sent to email@hidden