Re: Parsing PDF
Re: Parsing PDF
- Subject: Re: Parsing PDF
- From: Scott Ribe <email@hidden>
- Date: Sun, 13 Feb 2011 19:43:24 -0700
On Feb 13, 2011, at 5:02 PM, Graham Cox wrote:
> Hi all,
>
> I'm about to embark on a fairly serious bit of coding to extract all the graphic objects from a PDF file to turn them into editable entities in a drawing app.
>
> First, the documentation references some sample code called 'Voyeur' (/Developer/Examples/Quartz/PDF/Voyeur), but this no longer seems to exist. Does anyone have an up-to-date link?
>
> Second, has anyone already done this, and do they have code they are prepared to share? It must have been reimplemented many times over the years. I'm reading the documentation for what support there is in Cocoa for PDF parsing, and also the PDF spec itself, and it's a pretty big undertaking to get right. I have a distinct feeling I'm staring at a well-solved problem.
Below is skeleton code that will get you started using the CGPDFxxx APIs.
--
Scott Ribe
email@hidden
http://www.elevated-dev.com/
(303) 722-0567 voice
#include <Carbon/Carbon.h>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <map>
#include <string>
using namespace std;
// Human-readable names for PDF object types, indexed directly by the
// CGPDFObjectType value. Index 0 is an empty placeholder; DumpObject only
// indexes this table with values >= 1 that are below the table size.
static const char * sPdfTypeNames[] = { "", "null", "boolean", "integer", "real", "name", "string", "array", "dictionary", "stream" };
// Current nesting depth of the dump. DumpObject prints one "| " prefix per
// level and increments/decrements this around each recursive descent.
static int level = 1;
// Prints a short, type-specific summary of a PDF object to stdout and then
// terminates the line.
//
// Scalar types (boolean, integer, real, name, string) are written as
// " - <value>"; arrays and dictionaries are written as " - entries: <count>".
// Any other type, or a value that cannot be fetched, adds nothing before the
// newline.
//
// obj: the PDF object to describe.
void DumpObjectProperties( CGPDFObjectRef obj )
{
    switch( CGPDFObjectGetType( obj ) )
    {
        case kCGPDFObjectTypeBoolean:
        {
            CGPDFBoolean pdfbool = 0;
            if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeBoolean, &pdfbool ) )
            {
                // Convert to a real bool so the stream formats it as a boolean
                // (1/0 with default stream flags) instead of as a raw value.
                std::cout << " - " << ( pdfbool != 0 );
            }
        }
        break;

        case kCGPDFObjectTypeInteger:
        {
            CGPDFInteger pdfint = 0;
            if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeInteger, &pdfint ) )
                std::cout << " - " << pdfint;
        }
        break;

        case kCGPDFObjectTypeReal:
        {
            CGPDFReal pdfreal = 0;
            if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeReal, &pdfreal ) )
                std::cout << " - " << pdfreal;
        }
        break;

        case kCGPDFObjectTypeName:
        {
            const char * name = NULL;
            // Streaming a null char pointer is undefined behaviour, so require
            // a non-null result in addition to a successful fetch.
            if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeName, &name ) && name )
                std::cout << " - " << name;
        }
        break;

        case kCGPDFObjectTypeString:
        {
            CGPDFStringRef pdfstr = NULL;
            if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeString, &pdfstr ) && pdfstr )
            {
                const unsigned char * bytes = CGPDFStringGetBytePtr( pdfstr );
                const size_t length = CGPDFStringGetLength( pdfstr );

                // Copy by explicit length: the bytes are written exactly as
                // stored, without relying on a terminating NUL.
                std::string text;
                if( bytes && length > 0 )
                    text.assign( reinterpret_cast<const char *>( bytes ), length );
                std::cout << " - " << text;
            }
        }
        break;

        case kCGPDFObjectTypeArray:
        {
            CGPDFArrayRef array = NULL;
            if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeArray, &array ) )
            {
                // Keep the count as size_t to avoid narrowing it to int.
                const size_t cnt = CGPDFArrayGetCount( array );
                std::cout << " - " << "entries: " << cnt;
            }
        }
        break;

        case kCGPDFObjectTypeDictionary:
        {
            CGPDFDictionaryRef dict = NULL;
            if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeDictionary, &dict ) )
            {
                const size_t cnt = CGPDFDictionaryGetCount( dict );
                std::cout << " - " << "entries: " << cnt;
            }
        }
        break;

        default:
            // Remaining types get no value summary here.
            break;
    }

    // endl writes the newline and flushes the stream.
    std::cout << std::endl;
}
void DumpObject( const char * key, CGPDFObjectRef obj, void * info )
{
for( int i = 0; i < level; ++i )
cout << "| ";
CGPDFObjectType type = CGPDFObjectGetType( obj );
if( type >= 1 && type < sizeof( sPdfTypeNames ) / sizeof( char *) )
{
cout << key << ": " << sPdfTypeNames[type];
DumpObjectProperties( obj );
}
else
cout << key << ": " << "unrecognized object type " << type << endl << flush;
switch( type )
{
case kCGPDFObjectTypeDictionary:
{
if( strcmp( "Parent", key ) )
{
++level;
CGPDFDictionaryRef dict;
if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeDictionary, &dict ) )
CGPDFDictionaryApplyFunction( dict, DumpObject, NULL );
--level;
}
}
break;
case kCGPDFObjectTypeArray:
{
++level;
CGPDFArrayRef array;
if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeArray, &array ) )
{
int arraycnt = CGPDFArrayGetCount( array );
for( int i = 0; i < arraycnt; ++i )
{
CGPDFObjectRef aryobj;
if( CGPDFArrayGetObject( array, i, &aryobj ) )
{
char tmp[16];
sprintf( tmp, "%d", i );
DumpObject( tmp, aryobj, NULL );
}
}
}
--level;
}
break;
case kCGPDFObjectTypeStream:
{
++level;
CGPDFStreamRef strm;
if( CGPDFObjectGetValue( obj, kCGPDFObjectTypeStream, &strm ) )
{
CGPDFDictionaryRef dict = CGPDFStreamGetDictionary( strm );
if( dict )
CGPDFDictionaryApplyFunction( dict, DumpObject, NULL );
}
--level;
}
break;
}
}
int main (int argc, char * const argv[])
{
if( argc != 2 )
{
cerr << "usage: pdfdir source.pdf" << endl << flush;
return 1;
}
CFStringRef path = CFStringCreateWithCString( NULL, argv[1], kCFStringEncodingUTF8 );
CFURLRef url = CFURLCreateWithFileSystemPath( NULL, path, kCFURLPOSIXPathStyle, 0 );
CGPDFDocumentRef doc = CGPDFDocumentCreateWithURL( url );
if( !doc )
{
cerr << "could not open source pdf file" << endl << flush;
return 1;
}
int pgcnt = CGPDFDocumentGetNumberOfPages( doc );
if( pgcnt <= 0 )
{
cerr << "source pdf file has no pages" << endl << flush;
return 1;
}
cout << "page count: " << pgcnt << endl << flush;
for( int i1 = 0; i1 < pgcnt; ++i1 )
{
CGPDFPageRef pg = CGPDFDocumentGetPage( doc, i1 + 1 );
if( !pg )
{
cerr << "failed to read page " << i1 + 1 << " of source pdf file" << endl << flush;
return 1;
}
CGPDFDictionaryRef dict = CGPDFPageGetDictionary( pg );
if( !dict )
{
cerr << "failed to read dictionary for page " << i1 + 1 << " of source pdf file" << endl << flush;
return 1;
}
cout << "page: " << i1 + 1 << endl << flush;
CGPDFDictionaryApplyFunction( dict, DumpObject, NULL );
}
return 0;
}
_______________________________________________
Cocoa-dev mailing list (email@hidden)
Please do not post admin requests or moderator comments to the list.
Contact the moderators at cocoa-dev-admins(at)lists.apple.com
Help/Unsubscribe/Update your Subscription:
This email sent to email@hidden
References: | |
| >Parsing PDF (From: Graham Cox <email@hidden>) |