[patch] massive libtool speedup for large libs
[patch] massive libtool speedup for large libs
- Subject: [patch] massive libtool speedup for large libs
- From: Scott Wheeler <email@hidden>
- Date: Tue, 13 Feb 2007 16:16:26 +0100
(It was suggested that I send this here instead. An additional note,
discovered after sending the original message, is that switching our
debug builds to DWARF reduced the lib size to about 20% of what it was
with stabs.)
Hi folks --
At my office we build some rather large static libraries -- the
debugging builds weigh in at around 700 MB. Since many of our Macs have
a lowly 1 GB of RAM, libtool was really slowing things down.
I pulled the code down from the Open Source site and looked through it
and saw that it's creating a memory buffer as large as the library and
writing to that first and then sending that out to the disk all in one
chunk.
libtool currently seems to allocate roughly 2x the size of the library
(I didn't poke around to see the other half, but I assume it's coming
from the symbols being read from the object files), and with a little
math that puts it up to 1.4 GB of allocated memory for creating the
above library. This puts it well into swap territory. Effectively it
means that a huge buffer is allocated, everything gets swapped out, the
system becomes completely unresponsive while it renders the data out to
a mix of memory and swap (all the while naturally trashing any disk
buffers / cache) and then pages all of that back in to write it out to a
file.
I reworked create_library() to skip the buffer stage and write directly
to the target file. In the case above this knocks out about 60% of the
runtime and leaves the system at least semi-responsive. There are other
places that could be optimized, but the major bottleneck at this point
is just writing the library to disk. I did binary diffs on the output
of the system version and my own and the only different bytes are the
time stamp.
I hope this is the right place to send the patch. I'll probably do the
same thing to ld soon-ish since it has the same performance
characteristics, but that patch will be a bit more invasive. (And right
now cctools can't find some of the headers that it wants for building
ld. "streams/streams.h" anyone?)
Cheers,
-Scott
--- libtool.c.orig 2007-02-02 15:19:02.000000000 +0100
+++ libtool.c 2007-02-02 09:07:11.000000000 +0100
@@ -47,6 +47,9 @@
#include "stuff/execute.h"
#include "stuff/version_number.h"
+#include <unistd.h>
+#include <sys/mman.h>
+
#include "make.h"
#include <mach/mach_init.h>
#if defined(__OPENSTEP__) || defined(__GONZO_BUNSEN_BEAKER__)
@@ -1915,6 +1918,14 @@ void)
* archs into the specified output file. Only when more than one architecture
* is in archs will a fat file be created.
*/
+
+/*
+ * write_padding() writes padding_size zero bytes to the file descriptor fd.
+ * create_library() uses it for the zero fill that follows extended format #1
+ * long names. The zeros come from a small static buffer, so nothing is
+ * allocated and a padding_size of zero writes nothing. A short write is
+ * continued until all of the padding is out. A failed write() is fatal,
+ * because carrying on would leave a silently corrupt output file.
+ */
+static void write_padding(int fd, size_t padding_size)
+{
+ static const char zeros[64];
+ size_t chunk;
+ ssize_t written;
+
+ while(padding_size > 0){
+  chunk = padding_size < sizeof(zeros) ? padding_size : sizeof(zeros);
+  written = write(fd, zeros, chunk);
+  /* treat 0 as an error too so a stuck descriptor cannot loop forever */
+  if(written <= 0){
+   system_fatal("can't write padding to output file");
+   return;
+  }
+  padding_size -= (size_t)written;
+ }
+}
+
static
void
create_library(
@@ -1922,10 +1933,8 @@ char *output)
{
unsigned long i, j, k, l, library_size, offset, pad, *time_offsets;
enum byte_sex target_byte_sex;
- char *library, *p;
- kern_return_t r;
struct arch *arch;
- struct fat_header *fat_header;
+ struct fat_header fat_header;
struct fat_arch *fat_arch;
int fd;
#ifndef __OPENSTEP__
@@ -2006,25 +2015,30 @@ char *output)
exit(EXIT_SUCCESS);
/*
- * This buffer is vm_allocate'ed to make sure all holes are filled with
- * zero bytes.
+ * Create the output file. The unlink() is done to handle the problem
+ * when the outputfile is not writable but the directory allows the
+ * file to be removed (since the file may not be there the return code
+ * of the unlink() is ignored).
*/
- if((r = vm_allocate(mach_task_self(), (vm_address_t *)&library,
- library_size, TRUE)) != KERN_SUCCESS)
- mach_fatal(r, "can't vm_allocate() buffer for output file: %s of "
- "size %lu", output, library_size);
+ (void)unlink(output);
+ if((fd = open(output, O_WRONLY | O_CREAT | O_TRUNC, 0666)) == -1){
+ system_error("can't create output file: %s", output);
+ return;
+ }
+
/*
* If there is more than one architecture then fill in the fat file
* header and the fat_arch structures in the buffer.
*/
if(narchs > 1){
- fat_header = (struct fat_header *)library;
- fat_header->magic = FAT_MAGIC;
- fat_header->nfat_arch = narchs;
+ memset(&fat_header, 0, sizeof(struct fat_header));
+ fat_header.magic = FAT_MAGIC;
+ fat_header.nfat_arch = narchs;
offset = sizeof(struct fat_header) +
sizeof(struct fat_arch) * narchs;
- fat_arch = (struct fat_arch *)(library + sizeof(struct fat_header));
+ write(fd, &fat_header, sizeof(struct fat_header));
+ fat_arch = calloc(narchs, sizeof(struct fat_arch));
for(i = 0; i < narchs; i++){
fat_arch[i].cputype = archs[i].arch_flag.cputype;
fat_arch[i].cpusubtype = archs[i].arch_flag.cpusubtype;
@@ -2034,11 +2048,13 @@ char *output)
offset += archs[i].size;
}
#ifdef __LITTLE_ENDIAN__
- swap_fat_header(fat_header, BIG_ENDIAN_BYTE_SEX);
+ swap_fat_header(&fat_header, BIG_ENDIAN_BYTE_SEX);
swap_fat_arch(fat_arch, narchs, BIG_ENDIAN_BYTE_SEX);
#endif /* __LITTLE_ENDIAN__ */
offset = sizeof(struct fat_header) +
sizeof(struct fat_arch) * narchs;
+ write(fd, fat_arch, sizeof(struct fat_arch) * narchs);
+ free(fat_arch);
}
else
offset = 0;
@@ -2052,8 +2068,8 @@ char *output)
/*
* Now put each arch in the buffer.
*/
+
for(i = 0; i < narchs; i++){
- p = library + offset;
arch = archs + i;
/*
@@ -2065,8 +2081,7 @@ char *output)
*/
/* put in the archive magic string */
- memcpy(p, ARMAG, SARMAG);
- p += SARMAG;
+ write(fd, ARMAG, SARMAG);
/*
* Warn for what really is a bad library that has an empty table of
@@ -2105,60 +2120,52 @@ char *output)
* a long for the number of bytes of the strings for the ranlibs
* the strings for the ranlib structs
*/
+
time_offsets[i] =
- (p - library) +
- ((char *)&toc_ar_hdr.ar_date - (char *)&toc_ar_hdr);
- memcpy(p, (char *)&arch->toc_ar_hdr, sizeof(struct ar_hdr));
- p += sizeof(struct ar_hdr);
+ lseek(fd, 0, SEEK_CUR) +
+ ((char *)&toc_ar_hdr.ar_date - (char *)&toc_ar_hdr);
+ write(fd, (char *)&arch->toc_ar_hdr, sizeof(struct ar_hdr));
if(arch->toc_long_name == TRUE){
- memcpy(p, arch->toc_name, arch->toc_name_size);
- p += arch->toc_name_size +
- (round(sizeof(struct ar_hdr), 8) -
- sizeof(struct ar_hdr));
+ write(fd, arch->toc_name, arch->toc_name_size);
+ write_padding(fd, round(sizeof(struct ar_hdr), 8) - sizeof(struct ar_hdr));
}
l = arch->toc_nranlibs * sizeof(struct ranlib);
if(target_byte_sex != host_byte_sex)
l = SWAP_LONG(l);
- memcpy(p, (char *)&l, sizeof(long));
- p += sizeof(long);
+ write(fd, (char *)&l, sizeof(long));
if(target_byte_sex != host_byte_sex)
swap_ranlib(arch->toc_ranlibs, arch->toc_nranlibs,
target_byte_sex);
- memcpy(p, (char *)arch->toc_ranlibs,
- arch->toc_nranlibs * sizeof(struct ranlib));
- p += arch->toc_nranlibs * sizeof(struct ranlib);
+ write(fd, (char *)arch->toc_ranlibs,
+ arch->toc_nranlibs * sizeof(struct ranlib));
l = arch->toc_strsize;
if(target_byte_sex != host_byte_sex)
l = SWAP_LONG(l);
- memcpy(p, (char *)&l, sizeof(long));
- p += sizeof(long);
+ write(fd, (char *)&l, sizeof(long));
- memcpy(p, (char *)arch->toc_strings, arch->toc_strsize);
- p += arch->toc_strsize;
+ write(fd, (char *)arch->toc_strings, arch->toc_strsize);
/*
* Put in the archive header and member contents for each member.
*/
for(j = 0; j < arch->nmembers; j++){
- memcpy(p, (char *)&(arch->members[j].ar_hdr),
- sizeof(struct ar_hdr));
- p += sizeof(struct ar_hdr);
-
+ write(fd, (char *)&(arch->members[j].ar_hdr), sizeof(struct ar_hdr));
/*
* If we are using extended format #1 for long names write out
* the name. Note the name is padded with '\0' and the
* member_name_size is the unrounded size.
*/
if(arch->members[j].output_long_name == TRUE){
- strncpy(p, arch->members[j].member_name,
- arch->members[j].member_name_size);
- p += round(arch->members[j].member_name_size, 8) +
- (round(sizeof(struct ar_hdr), 8) -
- sizeof(struct ar_hdr));
+ write(fd, arch->members[j].member_name, arch->members[j].member_name_size);
+ write_padding(fd,
+ (round(arch->members[j].member_name_size, 8) -
+ arch->members[j].member_name_size) +
+ (round(sizeof(struct ar_hdr), 8) -
+ sizeof(struct ar_hdr)));
}
/*
@@ -2178,33 +2185,16 @@ char *output)
arch->members[j].load_commands) == FALSE)
fatal("internal error: swap_object_headers() failed");
}
- memcpy(p, arch->members[j].object_addr,
- arch->members[j].object_size);
- p += arch->members[j].object_size;
+ write(fd, arch->members[j].object_addr, arch->members[j].object_size);
pad = round(arch->members[j].object_size, 8) -
arch->members[j].object_size;
/* as with the UNIX ar(1) program pad with '\n' characters */
for(k = 0; k < pad; k++)
- *p++ = '\n';
+ write(fd, "\n", 1);
}
offset += arch->size;
}
- /*
- * Create the output file. The unlink() is done to handle the problem
- * when the outputfile is not writable but the directory allows the
- * file to be removed (since the file may not be there the return code
- * of the unlink() is ignored).
- */
- (void)unlink(output);
- if((fd = open(output, O_WRONLY | O_CREAT | O_TRUNC, 0666)) == -1){
- system_error("can't create output file: %s", output);
- return;
- }
- if(write(fd, library, library_size) != (int)library_size){
- system_error("can't write output file: %s", output);
- return;
- }
if(close(fd) == -1){
system_fatal("can't close output file: %s", output);
return;
@@ -2265,11 +2255,6 @@ char *output)
output);
return;
}
- if((r = vm_deallocate(mach_task_self(), (vm_address_t)library,
- library_size)) != KERN_SUCCESS){
- my_mach_error(r, "can't vm_deallocate() buffer for output file");
- return;
- }
}
/*
_______________________________________________
Do not post admin requests to the list. They will be ignored.
Xcode-users mailing list (email@hidden)
Help/Unsubscribe/Update your Subscription:
This email sent to email@hidden