Re: Extracting Images from scanned PDF document
Re: Extracting Images from scanned PDF document
- Subject: Re: Extracting Images from scanned PDF document
- From: Scott Ribe <email@hidden>
- Date: Thu, 28 Sep 2006 20:30:56 -0600
- Thread-topic: Extracting Images from scanned PDF document
> What I'm looking for is suggested directions to investigate. Is it
> easy to dissect the PDF and get access to its pieces? How would that
> be done? Any pointers would be welcome.
I think this requires diving into Carbon--no Cocoa API that I know of. Well,
that said, if you ASSUME that each page has only a single picture and
nothing else, and you make certain ASSUMPTIONS about naming conventions, and
you ASSUME the images are 1-bit and are deflate-coded, and you ASSUME you're
using a modified version of tifflib to generate output, and you want to
create a multi-page TIFF instead of individual TIFF files, well, then, it
might go something like this:
--------------------------
#include <Carbon/Carbon.h>
#include "tiffio_osx.h"
#include <iostream>
using namespace std;
int main (int argc, char * const argv[])
{
if( argc != 3 )
{
cerr << "pagesender2tiff Copyright 2006 by Scott Ribe" << endl
<< "usage: pagesender2tiff source.pdf dest.tiff" << endl <<
flush;
return 1;
}
CFStringRef path = CFStringCreateWithCString( NULL, argv[1],
kCFStringEncodingUTF8 );
CFURLRef url = CFURLCreateWithFileSystemPath( NULL, path,
kCFURLPOSIXPathStyle, 0 );
CGPDFDocumentRef doc = CGPDFDocumentCreateWithURL( url );
if( !doc )
{
cerr << "could not open source pdf file" << endl << flush;
return 1;
}
int pgcnt = CGPDFDocumentGetNumberOfPages( doc );
if( pgcnt <= 0 )
{
cerr << "source pdf file has no pages" << endl << flush;
return 1;
}
TIFF * tif = TIFFOpen( argv[2], "w" );
if( !tif )
{
cerr << "failed to open destination tiff file" << endl << flush;
return 1;
}
for( int i1 = 0; i1 < pgcnt; ++i1 )
{
CGPDFPageRef pg = CGPDFDocumentGetPage( doc, i1 + 1 );
if( !pg )
{
cerr << "failed to read page " << i1 + 1 << endl << flush;
return 1;
}
CGPDFDictionaryRef dict = CGPDFPageGetDictionary( pg );
if( !dict )
{
cerr << "failed to read dictionary for page " << i1 + 1 << endl
<< flush;
return 1;
}
CGPDFStreamRef cont;
if( !CGPDFDictionaryGetStream( dict, "Contents", &cont ) )
{
cerr << "failed to read contents stream for page " << i1 + 1 <<
endl << flush;
return 1;
}
CFDataRef contdata = CGPDFStreamCopyData( cont, NULL );
cout << "contents: " << (char *) CFDataGetBytePtr( contdata ) <<
endl << flush;
CGPDFArrayRef media;
if( !CGPDFDictionaryGetArray( dict, "MediaBox", &media ) )
{
cerr << "failed to read media box array for page " << i1 + 1 <<
endl << flush;
return 1;
}
CGPDFInteger mediatop, medialeft;
CGPDFReal mediaright, mediabottom;
if( !CGPDFArrayGetInteger( media, 0, &mediatop ) ||
!CGPDFArrayGetInteger( media, 1, &medialeft ) ||
!CGPDFArrayGetNumber( media, 2, &mediaright ) ||
!CGPDFArrayGetNumber( media, 3, &mediabottom ) )
{
cerr << "failed to read media box values for page " << i1 + 1 <<
endl << flush;
return 1;
}
double mediawidth = mediaright - medialeft, mediaheight =
mediabottom - mediatop;
CGPDFDictionaryRef res;
if( !CGPDFDictionaryGetDictionary( dict, "Resources", &res ) )
{
cerr << "failed to read resources dictionary for page " << i1 +
1 << endl << flush;
return 1;
}
CGPDFDictionaryRef xobj;
if( !CGPDFDictionaryGetDictionary( res, "XObject", &xobj ) )
{
cerr << "failed to read xobject dictionary for page " << i1 + 1
<< endl << flush;
return 1;
}
char imagestr[16];
sprintf( imagestr, "Im%d", i1 + 1 );
CGPDFStreamRef strm;
if( !CGPDFDictionaryGetStream( xobj, imagestr, &strm ) )
{
cerr << "failed to read " << imagestr <<" stream for page " <<
i1 + 1 << endl << flush;
return 1;
}
CGPDFDictionaryRef strmdict = CGPDFStreamGetDictionary( strm );
if( !strmdict )
{
cerr << "failed to read dictionary of " << imagestr << " stream
for page " << i1 + 1 << endl << flush;
return 1;
}
const char * type;
if( !CGPDFDictionaryGetName( strmdict, "Type", &type ) || strcmp(
type, "XObject" ) )
{
cerr << "failed to read Type:XObject of Im1 stream for page " <<
i1 + 1 << endl << flush;
return 1;
}
const char * subtype;
if( !CGPDFDictionaryGetName( strmdict, "Subtype", &subtype ) ||
strcmp( subtype, "Image" ) )
{
cerr << "failed to read Subtype:Image of " << imagestr << "
stream for page " << i1 + 1 << endl << flush;
return 1;
}
CGPDFInteger bitsper;
if( !CGPDFDictionaryGetInteger( strmdict, "BitsPerComponent",
&bitsper ) || bitsper != 1 )
{
cerr << "failed to read BitsPerComponent:1 of " << imagestr << "
stream for page " << i1 + 1 << endl << flush;
return 1;
}
const char * filter;
if( !CGPDFDictionaryGetName( strmdict, "Filter", &filter ) ||
strcmp( filter, "FlateDecode" ) )
{
cerr << "failed to read Filter:FlateDecode of " << imagestr << "
stream for page " << i1 + 1 << endl << flush;
return 1;
}
CGPDFInteger width, height;
if( !CGPDFDictionaryGetInteger( strmdict, "Width", &width ) ||
!CGPDFDictionaryGetInteger( strmdict, "Height", &height ) )
{
cerr << "failed to read width or height of " << imagestr << "
stream for page " << i1 + 1 << endl << flush;
return 1;
}
CGPDFDataFormat fmt = CGPDFDataFormatRaw;
CFDataRef data = CGPDFStreamCopyData( strm, &fmt );
int32_t len = CFDataGetLength( data );
const void * bytes = CFDataGetBytePtr( data );
cout << "image len: " << len << endl << flush;
int32_t rowbytes = (width + 7) / 8;
if( rowbytes * height != len )
{
cerr << "calculated rowbytes (" << rowbytes << ") x height (" <<
height << ") does not match data length (" << len << ")" << endl << flush;
return 1;
}
double xres = width / mediawidth * 72.0, yres = height / mediaheight
* 72.0;
xres = round( xres * 1000 ) / 1000;
yres = round( yres * 1000 ) / 1000;
cout << "xres: " << xres << ", yres: " << yres << endl << flush;
TIFFSetField(tif, TIFFTAG_IMAGEWIDTH, width);
TIFFSetField(tif, TIFFTAG_IMAGELENGTH, height);
TIFFSetField(tif, TIFFTAG_BITSPERSAMPLE, 1);
TIFFSetField(tif, TIFFTAG_SAMPLESPERPIXEL, 1);
TIFFSetField(tif, TIFFTAG_ROWSPERSTRIP, height );
TIFFSetField(tif, TIFFTAG_COMPRESSION, COMPRESSION_CCITTFAX4);
TIFFSetField(tif, TIFFTAG_PHOTOMETRIC, PHOTOMETRIC_MINISWHITE);
TIFFSetField(tif, TIFFTAG_FILLORDER, FILLORDER_MSB2LSB);
TIFFSetField(tif, TIFFTAG_PLANARCONFIG, PLANARCONFIG_CONTIG);
TIFFSetField(tif, TIFFTAG_XRESOLUTION, xres);
TIFFSetField(tif, TIFFTAG_YRESOLUTION, yres);
TIFFSetField(tif, TIFFTAG_RESOLUTIONUNIT, RESUNIT_INCH);
TIFFSetField( tif, TIFFTAG_SOFTWARE, "pagesender2tiff Copyright 2006
by Scott Ribe" );
TIFFWriteEncodedStrip( tif, 0, (void *) bytes, rowbytes * height );
TIFFWriteDirectory( tif );
CFRelease( data );
CGPDFPageRelease( pg );
}
TIFFClose( tif );
return 0;
}
--------------------------
And, just because I feel like earning some bonus points, if you don't know
how your source PDFs are actually structured and are wondering how to figure
out where your PDFs differ from mine, here's how to examine the structure of
a PDF file:
--------------------------
#include <Carbon/Carbon.h>
#include <iostream>
#include <map>
using namespace std;
// Printable names for PDF object types. DumpObject indexes this table with
// the value returned by CGPDFObjectGetType, accepting only indices from 1 up
// to the table size; entry 0 is an empty placeholder that is never printed.
static const char * sPdfTypeNames[] = { "", "null", "boolean", "integer",
"real", "name", "string", "array", "dictionary", "stream" };
// Current nesting depth of the dump: DumpObject prints this many "| "
// prefixes before each entry and bumps it while descending into a
// dictionary, array or stream.
static int level = 1;
// Prints the value summary for one PDF object, followed by a newline.
//
// Scalars (boolean, integer, real, name, string) are printed as " - <value>";
// arrays and dictionaries are printed as " - entries: <count>". Any other
// type, or a value that cannot be fetched, prints only the newline.
void DumpObjectProperties( CGPDFObjectRef obj )
{
    const CGPDFObjectType kind = CGPDFObjectGetType( obj );

    if( kind == kCGPDFObjectTypeBoolean )
    {
        CGPDFBoolean flag;
        if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeBoolean, &flag ) )
        {
            // Streamed as a C++ bool, exactly like the true/false literals.
            const bool value = flag ? true : false;
            cout << " - " << value;
        }
    }
    else if( kind == kCGPDFObjectTypeInteger )
    {
        CGPDFInteger number;
        if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeInteger, &number ) )
            cout << " - " << number;
    }
    else if( kind == kCGPDFObjectTypeReal )
    {
        CGPDFReal number;
        if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeReal, &number ) )
            cout << " - " << number;
    }
    else if( kind == kCGPDFObjectTypeName )
    {
        const char * text;
        if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeName, &text ) )
            cout << " - " << text;
    }
    else if( kind == kCGPDFObjectTypeString )
    {
        CGPDFStringRef str;
        if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeString, &str ) )
        {
            // Build the string from pointer + length rather than relying on
            // a terminator.
            const string text( (char *) CGPDFStringGetBytePtr( str ),
                               CGPDFStringGetLength( str ) );
            cout << " - " << text;
        }
    }
    else if( kind == kCGPDFObjectTypeArray )
    {
        CGPDFArrayRef items;
        if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeArray, &items ) )
        {
            const int total = CGPDFArrayGetCount( items );
            cout << " - " << "entries: " << total;
        }
    }
    else if( kind == kCGPDFObjectTypeDictionary )
    {
        CGPDFDictionaryRef entries;
        if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeDictionary, &entries ) )
        {
            const int total = CGPDFDictionaryGetCount( entries );
            cout << " - " << "entries: " << total;
        }
    }

    cout << endl << flush;
}
void DumpObject( const char * key, CGPDFObjectRef obj, void * info )
{
for( int i = 0; i < level; ++i )
cout << "| ";
CGPDFObjectType type = CGPDFObjectGetType( obj );
if( type >= 1 && type < sizeof( sPdfTypeNames ) / sizeof( char *) )
{
cout << key << ": " << sPdfTypeNames[type];
DumpObjectProperties( obj );
}
else
cout << key << ": " << "unrecognized object type " << type << endl
<< flush;
switch( type )
{
case kCGPDFObjectTypeDictionary:
{
if( strcmp( "Parent", key ) )
{
++level;
CGPDFDictionaryRef dict;
if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeDictionary,
&dict ) )
CGPDFDictionaryApplyFunction( dict, DumpObject, NULL );
--level;
}
}
break;
case kCGPDFObjectTypeArray:
{
++level;
CGPDFArrayRef array;
if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeArray, &array
) )
{
int arraycnt = CGPDFArrayGetCount( array );
for( int i = 0; i < arraycnt; ++i )
{
CGPDFObjectRef aryobj;
if( CGPDFArrayGetObject( array, i, &aryobj ) )
{
char tmp[16];
sprintf( tmp, "%d", i );
DumpObject( tmp, aryobj, NULL );
}
}
}
--level;
}
break;
case kCGPDFObjectTypeStream:
{
++level;
CGPDFStreamRef strm;
if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeStream, &strm ) )
{
CGPDFDictionaryRef dict = CGPDFStreamGetDictionary( strm );
if( dict )
CGPDFDictionaryApplyFunction( dict, DumpObject, NULL );
}
--level;
}
break;
}
}
int main (int argc, char * const argv[])
{
if( argc != 2 )
{
cerr << "usage: pdfdir source.pdf" << endl << flush;
return 1;
}
CFStringRef path = CFStringCreateWithCString( NULL, argv[1],
kCFStringEncodingUTF8 );
CFURLRef url = CFURLCreateWithFileSystemPath( NULL, path,
kCFURLPOSIXPathStyle, 0 );
CGPDFDocumentRef doc = CGPDFDocumentCreateWithURL( url );
if( !doc )
{
cerr << "could not open source pdf file" << endl << flush;
return 1;
}
int pgcnt = CGPDFDocumentGetNumberOfPages( doc );
if( pgcnt <= 0 )
{
cerr << "source pdf file has no pages" << endl << flush;
return 1;
}
cout << "page count: " << pgcnt << endl << flush;
for( int i1 = 0; i1 < pgcnt; ++i1 )
{
CGPDFPageRef pg = CGPDFDocumentGetPage( doc, i1 + 1 );
if( !pg )
{
cerr << "failed to read page " << i1 + 1 << " of source pdf
file" << endl << flush;
return 1;
}
CGPDFDictionaryRef dict = CGPDFPageGetDictionary( pg );
if( !dict )
{
cerr << "failed to read dictionary for page " << i1 + 1 << " of
source pdf file" << endl << flush;
return 1;
}
cout << "page: " << i1 + 1 << endl << flush;
CGPDFDictionaryApplyFunction( dict, DumpObject, NULL );
}
return 0;
}
--------------------------
Enjoy ;-)
--
Scott Ribe
email@hidden
http://www.killerbytes.com/
(303) 722-0567 voice
_______________________________________________
Do not post admin requests to the list. They will be ignored.
Cocoa-dev mailing list (email@hidden)
Help/Unsubscribe/Update your Subscription:
This email sent to email@hidden