Hi all,
I'm trying to port an RTP stack from Linux to Leopard, and have hit the
problem that under OS X the stack saturates the CPU under load testing
much more rapidly than under Linux (running under Parallels on the same
hardware). Specifically, when running ever-increasing numbers of RTP
streams over the loopback interface, OS X only manages ~20% of the
number of streams that Linux can handle before saturating the CPU. Shark
reveals that the stack spends most of its time in the kernel within
sendto and recvfrom (and kqueue for listening for events on sockets).
I think I've isolated the problem to OS X's UDP performance. The
following simple benchmark bounces a 256-byte packet 100k times
between two UDP sockets via the loopback interface:
-----------------------------------------
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/errno.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <netinet/in.h>
#include <arpa/inet.h>
/**
 * Bounce a UDP packet back and forth between two loopback
 * interface sockets as fast as we can.
 *
 * run() binds two UDP sockets on 127.0.0.1, connects each one to the
 * other, injects a single kPacketSize-byte packet and then relays it
 * kBounceCount times. Any socket error is fatal (see die()).
 */
class LoopbackBenchmarker {
public:
    LoopbackBenchmarker() : fd1(-1), fd2(-1), n(0) {}

    /** run() is virtual, so make deletion through a base pointer safe. */
    virtual ~LoopbackBenchmarker() {}

    /**
     * Print msg followed by the description of the current errno
     * (via perror) and terminate the process with exit(-1).
     */
    void die(const char * msg) {
        perror(msg);
        exit(-1);
    }

    /**
     * Build an IPv4 socket address.
     *
     * @param hostname dotted-quad address string, parsed with inet_aton
     * @param port     port number in host byte order
     * @return the populated address; the process exits if hostname
     *         cannot be parsed
     */
    struct sockaddr_in makeAddr(const char * hostname, int port) {
        struct sockaddr_in addr;
        // Zero the whole structure, not just sin_zero, so that no
        // platform-specific member or padding is left indeterminate.
        memset(&addr, 0, sizeof(addr));
        addr.sin_family = AF_INET;
        addr.sin_port = htons((uint16_t) port);
        if (!inet_aton(hostname, &addr.sin_addr)) {
            // inet_aton does not set errno, so perror() (and hence
            // die()) would print an unrelated error string here.
            fprintf(stderr, "invalid address: %s\n", hostname);
            exit(-1);
        }
        return addr;
    }

    /**
     * Create a UDP socket bound to localaddr.
     *
     * @param localaddr local address to bind to
     * @return the socket descriptor; the process exits on failure
     */
    int setupSocket(struct sockaddr_in localaddr) {
        int fd = socket(PF_INET, SOCK_DGRAM, 0);
        if (fd == -1) {
            die("couldn't open socket");
        }
        if (bind(fd, (const struct sockaddr *) &localaddr,
                 sizeof(localaddr))) {
            die("couldn't bind");
        }
        return fd;
    }

    /**
     * Relay the packet between fd1 and fd2 until kBounceCount bounces
     * have been performed in total, starting by receiving on fd.
     *
     * Each bounce reads one datagram from the current socket and sends
     * it straight back out of the same socket (which is connected to
     * the other one), then switches to the other socket.
     *
     * This is a loop rather than a self-call so that stack usage does
     * not depend on the compiler eliminating the tail call.
     *
     * @param fd socket on which the next packet is expected
     */
    void bouncePacket(int fd) {
        while (n < kBounceCount) {
            // try to read a packet off whichever socket it came in on...
            ssize_t received = recv(fd, buffer, sizeof(buffer), 0);
            if (received < 0) {
                // a negative length must never reach send()
                die("failed to receive a packet");
            }
            // ...and shove it back to where it came from.
            if (send(fd, buffer, (size_t) received, 0) == -1) {
                die("couldn't send");
            }
            n++;
            // the packet is now waiting on the other socket
            fd = (fd == fd2) ? fd1 : fd2;
        }
    }

    /**
     * Set up both sockets, kick off the exchange and bounce the packet
     * kBounceCount times. Closes both sockets before returning.
     */
    virtual void run() {
        struct sockaddr_in addr1 = makeAddr("127.0.0.1", 12346);
        struct sockaddr_in addr2 = makeAddr("127.0.0.1", 12348);
        fd1 = setupSocket(addr1);
        fd2 = setupSocket(addr2);
        if (connect(fd1, (const struct sockaddr *) &addr2, sizeof(addr2))) {
            die("can't connect fd1");
        }
        if (connect(fd2, (const struct sockaddr *) &addr1, sizeof(addr1))) {
            die("can't connect fd2");
        }

        // inject a packet into fd1 to kick things off; an int index is
        // used because a uint8_t counter can never reach kPacketSize
        for (int i = 0; i < kPacketSize; i++) {
            buffer[i] = (uint8_t) i;
        }
        if (send(fd1, buffer, kPacketSize, 0) == -1) {
            die("couldn't send");
        }

        // get ready to receive the packet on fd2, and start bouncing
        bouncePacket(fd2);

        close(fd1);
        close(fd2);
        fd1 = fd2 = -1;
    }

private:
    enum {
        kPacketSize  = 256,     // bytes in the bounced packet
        kBounceCount = 100000,  // number of receive/send round trips
        kBufferSize  = 65536    // receive buffer size in bytes
    };

    int fd1, fd2;                 // the two connected UDP sockets (-1 when closed)
    int n;                        // bounces performed so far
    uint8_t buffer[kBufferSize];  // packet payload / receive buffer
};
int main() {
LoopbackBenchmarker benchmarker;
benchmarker.run();
return(0);
}
-----------------------------------------
...and running it yields:
illyria$ uname -a
Darwin illyria.local 9.1.0 Darwin Kernel Version 9.1.0: Wed Oct 31
17:46:22 PDT 2007; root:xnu-1228.0.2~1/RELEASE_I386 i386
illyria$ g++ -O3 LoopbackBenchmarker.cpp
illyria$ time ./a.out
real 0m2.556s
user 0m0.299s
sys 0m1.570s
Whereas on an oldish Debian sarge installation running under Parallels
on the same hardware:
illyria-debian$ uname -a
Linux illyria-debian 2.4.27-2-386 #1 Wed Aug 17 09:33:35 UTC 2005 i686
GNU/Linux
illyria-debian$ g++ -O3 LoopbackBenchmarker.cpp
illyria-debian$ time ./a.out
real 0m0.218s
user 0m0.050s
sys 0m0.170s
...which seems to imply that OS X is around an order of magnitude slower
than Linux 2.4 when talking UDP :/ Please tell me it ain't so!
I benchmarked a few other socket protocols to see if this behaviour was
specific to UDP:
* UNIX sockets appear ~7 times slower under OS X
* raw IP sockets appear ~5 times slower under OS X
* UDP over IPv6 is roughly the same as over IPv4
I'm hoping that there's something painfully obvious I'm missing in
setting up my sockets or tuning the stack, that the loopback interface
is a particularly pathological case (relative to Linux) under OS X, or
that there is some other reason why this benchmark isn't actually
meaningful.
Any thoughts would be hugely appreciated :)
thanks,
Matthew.