Mime parsing improvements.

git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/base/trunk@8141 72102866-910b-0410-8b05-ffd578937521
This commit is contained in:
Richard Frith-MacDonald 2000-11-17 06:55:58 +00:00
parent 3744751390
commit 13a505f5ad
9 changed files with 468 additions and 117 deletions

View file

@ -1,3 +1,12 @@
2000-11-17 Richard Frith-Macdonald <rfm@gnu.org>
* Headers/Foundation/GSMime.h: Added GSMimeEncodingContext etc
* Source/GSMime.m: Update to add method for general decoding of
different transfer encoding types including chunked (for http1.1).
* Documentation/gsdoc/GSMime.gsdoc: updated
* Documentation/gsdoc/GSMimeDocument.gsdoc: updated
* Documentation/gsdoc/GSMimeParser.gsdoc: updated
2000-11-16 Richard Frith-Macdonald <rfm@gnu.org>
* Source/NSUserDefaults.m: ([-userLanguages]) updated to ensure we

View file

@ -7,8 +7,8 @@
<email address="rfm@gnu.org"/>
<url url="http://www.gnustep.org/developers/whoiswho.html"/>
</author>
<version>0.2</version>
<date>16 November, 2000</date>
<version>0.3</version>
<date>17 November, 2000</date>
</head>
<body>
<chapter>
@ -18,6 +18,27 @@
for representing MIME (and HTTP) documents and managing conversions
to and from convenient internal formats.
</p>
<p>
Eventually the goal is to center round three classes -
</p>
<deflist>
<term>document</term>
<desc>
A container for the actual data (and headers) of a mime/http document.
</desc>
<term>parser</term>
<desc>
An object that can be fed data and will parse it into a document.
This object also provides various utility methods and an API
that permits overriding in order to extend the functionality to
cope with new document types.
</desc>
<term>unparser</term>
<desc>
An object to take a mime/http document and produce a data object
suitable for transmission.
</desc>
</deflist>
<section>
<heading>The classes</heading>
<list>

View file

@ -9,8 +9,8 @@
<dt><a href ="http://www.gnustep.org/developers/whoiswho.html">Richard Frith-Macdonald</a>
<dd>
</dl>
<p>Version: 0.2</p>
<p>Date: 16 November, 2000</p>
<p>Version: 0.3</p>
<p>Date: 17 November, 2000</p>
<h2><a name ="cont-0">Mime Parser</a></h2>
<p>
@ -18,6 +18,28 @@
for representing MIME (and HTTP) documents and managing conversions
to and from convenient internal formats.
</p>
<p>
Eventually the goal is to center round three classes -
</p>
<dl>
<dt>document
<dd>
A container for the actual data (and headers) of a mime/http document.
<dt>parser
<dd>
An object that can be fed data and will parse it into a document.
This object also provides various utility methods and an API
that permits overriding in order to extend the functionality to
cope with new document types.
<dt>unparser
<dd>
An object to take a mime/http document and produce a data object
suitable for transmission.
</dl>
<h3><a name ="cont-1">The classes</a></h3>
<ul>
<li><a href ="GSMimeDocument.html">GSMimeDocument</a>

View file

@ -48,6 +48,9 @@
</desc>
<term>Value</term>
<desc>This is the value of the header (normally lower case).
It may only be a small subset of the information in the header
with other information being split into separate fields
depending on the type of header.
</desc>
</deflist>
<p>

View file

@ -57,6 +57,9 @@
<dt>Value
<dd>This is the value of the header (normally lower case).
It may only be a small subset of the information in the header
with other information being split into separate fields
depending on the type of header.
</dl>

View file

@ -19,9 +19,8 @@
<desc>
<p>
This class provides support for parsing MIME messages
into GSMimeDocument objects. It is imtimately related
to the GSMimeDocument class and relys on that class for
aspects of the parsing operation.
into GSMimeDocument objects. Each parser object maintains
an associated document into which data is stored.
</p>
</desc>
@ -101,6 +100,24 @@
subclasses override <code>scanHeader:named:inTo:</code> to
implement custom scanning.
</p>
<p>
As a special case, for HTTP support, this method also parses
lines in the format of HTTP responses as if they were headers
named <code>http</code>. The resulting header info dictionary
contains -
</p>
<deflist>
<term>HttpVersion</term>
<desc>The full HTTP protocol version number</desc>
<term>HttpMajorVersion</term>
<desc>The first part of the version number</desc>
<term>HttpMinorVersion</term>
<desc>The second part of the version number</desc>
<term>HttpStatus</term>
<desc>The HTTP status code</desc>
<term>Value</term>
<desc>The text message (if any) after the status code</desc>
</deflist>
</desc>
</method>
@ -110,7 +127,7 @@
Returns YES if the parser is expecting to read mime headers,
Returns NO if the parser has already been passed all the
data containing headers, and is now waiting for the body of
trhe mime message (or has been passed all data).
the mime message (or has been passed all data).
</desc>
</method>
@ -140,6 +157,89 @@
You should not call this method directly yourself, but may
override it to support parsing of new headers.
</p>
<p>
You should be aware of the parsing that the standard
implementation performs, and that <em>needs</em> to be
done for certain headers in order to permit the parser to
work generally -
</p>
<deflist>
<term>content-disposition</term>
<desc>
<deflist>
<term>Parameters</term>
<desc>
A dictionary containing parameters as key-value pairs
in lowercase
</desc>
<term>Value</term>
<desc>
The content disposition (excluding parameters) as a
lowercase string.
</desc>
</deflist>
</desc>
<term>content-type</term>
<desc>
<deflist>
<term>Parameters</term>
<desc>
A dictionary containing parameters as key-value pairs
in lowercase.
</desc>
<term>SubType</term>
<desc>The MIME subtype in lowercase</desc>
<term>Type</term>
<desc>The MIME type in lowercase</desc>
<term>Value</term>
<desc>The full MIME type (xxx/yyy) in lowercase</desc>
</deflist>
</desc>
<term>content-transfer-encoding</term>
<desc>
<deflist>
<term>Value</term>
<desc>The transfer encoding type in lowercase</desc>
</deflist>
</desc>
<term>http</term>
<desc>
<deflist>
<term>HttpVersion</term>
<desc>The HTTP protocol version number</desc>
<term>HttpMajorVersion</term>
<desc>The first component of the version number</desc>
<term>HttpMinorVersion</term>
<desc>The second component of the version number</desc>
<term>HttpStatus</term>
<desc>The response status value (numeric code)</desc>
<term>Value</term>
<desc>The text message (if any)</desc>
</deflist>
</desc>
<term>transfer-encoding</term>
<desc>
<deflist>
<term>Value</term>
<desc>The transfer encoding type in lowercase</desc>
</deflist>
</desc>
</deflist>
</desc>
</method>
<method type="BOOL">
<sel>scanPastSpace:</sel>
<arg type="NSScanner*">aScanner</arg>
<desc>
A convenience method to scan past any whitespace in the scanner
in preparation for scanning something more interesting that
comes after it. Returns YES if any space was read, NO otherwise.
</desc>
</method>

View file

@ -23,9 +23,8 @@
<p>
This class provides support for parsing MIME messages
into GSMimeDocument objects. It is imtimately related
to the GSMimeDocument class and relys on that class for
aspects of the parsing operation.
into GSMimeDocument objects. Each parser object maintains
an associated document into which data is stored.
</p>
@ -41,8 +40,9 @@
<li><a href ="GSMimeParser.html#method-4">-parseHeader:</a>
<li><a href ="GSMimeParser.html#method-5">-parsingHeaders</a>
<li><a href ="GSMimeParser.html#method-6">-scanHeader:named:inTo:</a>
<li><a href ="GSMimeParser.html#method-7">-scanSpecial:</a>
<li><a href ="GSMimeParser.html#method-8">-scanToken:</a>
<li><a href ="GSMimeParser.html#method-7">-scanPastSpace:</a>
<li><a href ="GSMimeParser.html#method-8">-scanSpecial:</a>
<li><a href ="GSMimeParser.html#method-9">-scanToken:</a>
</ul>
<hr><h2>Class Methods </h2>
<h3><a name ="method-0">mimeParser</a></h3>
@ -121,6 +121,27 @@
implement custom scanning.
</p>
<p>
As a special case, for HTTP support, this method also parses
lines in the format of HTTP responses as if they were headers
named <code>http</code>. The resulting header info dictionary
contains -
</p>
<dl>
<dt>HttpVersion
<dd>The full HTTP protocol version number
<dt>HttpMajorVersion
<dd>The first part of the version number
<dt>HttpMinorVersion
<dd>The second part of the version number
<dt>HttpStatus
<dd>The HTTP status code
<dt>Value
<dd>The text message (if any) after the status code
</dl>
<hr>
<h3><a name ="method-5">parsingHeaders</a></h3>
@ -129,7 +150,7 @@
Returns YES if the parser is expecting to read mime headers,
Returns NO if the parser has already been passed all the
data containing headers, and is now waiting for the body of
trhe mime message (or has been passed all data).
the mime message (or has been passed all data).
<hr>
<h3><a name ="method-6">scanHeader:named:inTo:</a></h3>
@ -161,9 +182,93 @@
override it to support parsing of new headers.
</p>
<p>
You should be aware of the parsing that the standard
implementation performs, and that <em>needs</em> to be
done for certain headers in order to permit the parser to
work generally -
</p>
<dl>
<dt>content-disposition
<dd>
<dl>
<dt>Parameters
<dd>
A dictionary containing parameters as key-value pairs
in lowercase
<dt>Value
<dd>
The content disposition (excluding parameters) as a
lowercase string.
</dl>
<dt>content-type
<dd>
<dl>
<dt>Parameters
<dd>
A dictionary containing parameters as key-value pairs
in lowercase.
<dt>SubType
<dd>The MIME subtype in lowercase
<dt>Type
<dd>The MIME type in lowercase
<dt>Value
<dd>The full MIME type (xxx/yyy) in lowercase
</dl>
<dt>content-transfer-encoding
<dd>
<dl>
<dt>Value
<dd>The transfer encoding type in lowercase
</dl>
<dt>http
<dd>
<dl>
<dt>HttpVersion
<dd>The HTTP protocol version number
<dt>HttpMajorVersion
<dd>The first component of the version number
<dt>HttpMinorVersion
<dd>The second component of the version number
<dt>HttpStatus
<dd>The response status value (numeric code)
<dt>Value
<dd>The text message (if any)
</dl>
<dt>transfer-encoding
<dd>
<dl>
<dt>Value
<dd>The transfer encoding type in lowercase
</dl>
</dl>
<hr>
<h3><a name ="method-7">scanSpecial:</a></h3>
<h3><a name ="method-7">scanPastSpace:</a></h3>
- (BOOL) <b>scanPastSpace:</b> (NSScanner*)aScanner;<br>
A convenience method to scan past any whitespace in the scanner
in preparation for scanning something more interesting that
comes after it. Returns YES if any space was read, NO otherwise.
<hr>
<h3><a name ="method-8">scanSpecial:</a></h3>
- (NSString*) <b>scanSpecial:</b> (NSScanner*)aScanner;<br>
A convenience method to use a scanner (that is set up to scan a
@ -173,7 +278,7 @@
will contain a single space character.
<hr>
<h3><a name ="method-8">scanToken:</a></h3>
<h3><a name ="method-9">scanToken:</a></h3>
- (NSString*) <b>scanToken:</b> (NSScanner*)aScanner;<br>
A convenience method to use a scanner (that is set up to scan a

View file

@ -58,6 +58,7 @@ typedef enum {
GSMimeEncoding type; /* The encoding type to be used. */
unsigned char buf[8]; /* Temporary data storage area. */
int pos; /* Context position count. */
BOOL foot; /* Reading footer near end of data. */
BOOL atEnd; /* Flag to say that data has ended. */
}
@end
@ -111,6 +112,7 @@ typedef enum {
- (BOOL) scanHeader: (NSScanner*)aScanner
named: (NSString*)headerName
inTo: (NSMutableDictionary*)info;
- (BOOL) scanPastSpace: (NSScanner*)aScanner;
- (NSString*) scanSpecial: (NSScanner*)aScanner;
- (NSString*) scanToken: (NSScanner*)aScanner;

View file

@ -194,7 +194,7 @@ parseCharacterSet(NSString *token)
@interface GSMimeParser (Private)
- (BOOL) _decodeBody;
- (BOOL) _decodeBody: (NSData*)data;
- (NSString*) _decodeHeader;
- (BOOL) _unfoldHeader;
@end
@ -330,7 +330,7 @@ parseCharacterSet(NSString *token)
decodebase64(dst, ctxt->buf);
size += len;
}
[dData setLength: dst - beg];
[dData setLength: size + dst - beg];
break;
case GSMimeEncodingQuotedPrintable:
@ -377,12 +377,36 @@ parseCharacterSet(NSString *token)
}
src++;
}
[dData setLength: dst - beg];
[dData setLength: size + dst - beg];
break;
case GSMimeEncodingChunked:
while (ctxt->atEnd == NO && src < end)
{
/*
* If we are reading a chunk footer, look for a blank line
* that terminates it.
*/
if (ctxt->foot == YES)
{
if (*src == '\r')
{
src++;
}
else if (*src != '\n' || ctxt->buf[0] != '\n')
{
ctxt->buf[0] = *src++;
}
else
{
ctxt->foot = NO;
ctxt->atEnd = YES;
src++;
break;
}
continue;
}
/*
* Keep track of chunk size in the context.
* A negative 'pos' indicates that we are reading the chunk size.
@ -453,7 +477,8 @@ parseCharacterSet(NSString *token)
*/
if (ctxt->pos == 0)
{
ctxt->atEnd = YES;
ctxt->foot = YES;
ctxt->buf[0] = src[-1]; // last char read
}
}
else
@ -482,7 +507,7 @@ parseCharacterSet(NSString *token)
}
src++;
}
[dData setLength: dst - beg];
[dData setLength: size + dst - beg];
}
}
break;
@ -495,8 +520,7 @@ parseCharacterSet(NSString *token)
[dData setLength: size + (end - src)];
dst = (unsigned char*)[dData mutableBytes];
memcpy(&dst[size], src, (end - src));
size += (end - src);
[dData setLength: size];
[dData setLength: size + end - src];
break;
}
@ -537,38 +561,45 @@ parseCharacterSet(NSString *token)
}
if ([d length] > 0)
{
[data appendBytes: [d bytes] length: [d length]];
bytes = (unsigned char*)[data mutableBytes];
dataEnd = [data length];
while (inBody == NO)
if (inBody == NO)
{
if ([self _unfoldHeader] == NO)
{
return YES; /* Needs more data to fill line. */
}
if (inBody == NO)
{
NSString *header;
[data appendBytes: [d bytes] length: [d length]];
bytes = (unsigned char*)[data mutableBytes];
dataEnd = [data length];
header = [self _decodeHeader];
if (header == nil)
while (inBody == NO)
{
if ([self _unfoldHeader] == NO)
{
return NO; /* Couldn't handle word encodings. */
return YES; /* Needs more data to fill line. */
}
if ([self parseHeader: header] == NO)
if (inBody == NO)
{
return NO; /* Header was not parsed properly. */
NSString *header;
header = [self _decodeHeader];
if (header == nil)
{
return NO; /* Couldn't handle words. */
}
if ([self parseHeader: header] == NO)
{
return NO; /* Header not parsed properly. */
}
}
}
/*
* All headers have been parsed, so we empty our internal buffer
* (which we will now use to store decoded data) and place unused
* information back in the incoming data object to act as input.
*/
d = AUTORELEASE([data copy]);
[data setLength: 0];
}
/*
* If we have a multipart document, we must feed the data to
* a child parser to decode the subsidiary parts.
*/
if (boundary != nil)
if ([d length] > 0)
{
[self _decodeBody];
[self _decodeBody: d];
}
return YES; /* Want more data for body */
}
@ -578,7 +609,7 @@ parseCharacterSet(NSString *token)
if (inBody == YES)
{
result = [self _decodeBody];
result = [self _decodeBody: d];
}
else
{
@ -609,27 +640,35 @@ parseCharacterSet(NSString *token)
*/
[info setObject: [scanner string] forKey: @"RawHeader"];
/*
* Special case - permit web response status line to act like a header.
*/
if ([scanner scanString: @"HTTP" intoString: &name] == NO
|| [scanner scanString: @"/" intoString: 0] == NO)
{
if ([scanner scanUpToString: @":" intoString: &name] == NO)
{
NSLog(@"Not a valid header (%@)", [scanner string]);
return NO;
}
/*
* Position scanner after colon and any white space.
*/
if ([scanner scanString: @":" intoString: 0] == NO)
{
NSLog(@"No colon terminating name in header (%@)", [scanner string]);
return NO;
}
}
/*
* Store the Raw header name and a lowercase version too.
*/
if ([scanner scanUpToString: @":" intoString: &name] == NO)
{
NSLog(@"No colon terminated name in header (%@)", [scanner string]);
return NO;
}
name = [name stringByTrimmingTailSpaces];
[info setObject: name forKey: @"BaseName"];
name = [name lowercaseString];
[info setObject: name forKey: @"Name"];
/*
* Position scanner after colon and any white space.
*/
if ([scanner scanString: @":" intoString: 0] == NO)
{
NSLog(@"No colon terminating name in header (%@)", [scanner string]);
return NO;
}
skip = RETAIN([scanner charactersToBeSkipped]);
[scanner setCharactersToBeSkipped: nil];
[scanner scanCharactersFromSet: skip intoString: 0];
@ -658,7 +697,7 @@ parseCharacterSet(NSString *token)
int majv = 0;
int minv = 0;
value = [info objectForKey: @"Value"];
value = [info objectForKey: @"BaseValue"];
if ([value length] == 0)
{
NSLog(@"Missing value for mime-version header");
@ -819,16 +858,45 @@ parseCharacterSet(NSString *token)
/*
* Now see if we are interested in any of it.
*/
if ([name isEqualToString: @"mime-version"] == YES)
if ([name isEqualToString: @"http"] == YES)
{
value = [self scanToken: scanner];
if ([value length] == 0)
int major;
int minor;
int status;
if ([scanner scanInt: &major] == NO || major < 0)
{
NSLog(@"Bad value for mime-version header");
NSLog(@"Bad value for http major version");
return NO;
}
if ([scanner scanString: @"." intoString: 0] == NO)
{
NSLog(@"Bad format for http version");
return NO;
}
if ([scanner scanInt: &minor] == NO || minor < 0)
{
NSLog(@"Bad value for http minor version");
return NO;
}
if ([scanner scanInt: &status] == NO || status < 0)
{
NSLog(@"Bad value for http status");
return NO;
}
[info setObject: [NSString stringWithFormat: @"%d", major]
forKey: @"HttpMajorVersion"];
[info setObject: [NSString stringWithFormat: @"%d", minor]
forKey: @"HttpMinorVersion"];
[info setObject: [NSString stringWithFormat: @"%d.%d", major, minor]
forKey: @"HttpVersion"];
[info setObject: [NSString stringWithFormat: @"%d", status]
forKey: @"HttpStatus"];
[self scanPastSpace: scanner];
value = [[scanner string] substringFromIndex: [scanner scanLocation]];
}
else if ([name isEqualToString: @"content-transfer-encoding"] == YES)
else if ([name isEqualToString: @"content-transfer-encoding"] == YES
|| [name isEqualToString: @"transfer-encoding"] == YES)
{
value = [self scanToken: scanner];
if ([value length] == 0)
@ -966,20 +1034,25 @@ parseCharacterSet(NSString *token)
return YES;
}
- (NSString*) scanSpecial: (NSScanner*)scanner
/*
 * Convenience method to move the scanner past any characters that are
 * members of its current 'characters to be skipped' set, in preparation
 * for scanning whatever follows.
 * Returns the result of the explicit scan of that set: YES if at least
 * one such character was consumed, NO otherwise.
 * The scanner's skip set is the same on return as it was on entry.
 */
- (BOOL) scanPastSpace: (NSScanner*)scanner
{
NSCharacterSet *skip;
BOOL scanned;
/*
 * Hold on to the skip set before telling the scanner to drop it, so it
 * can be used for the explicit scan and then restored afterwards.
 * NOTE(review): RETAIN/RELEASE are macros defined outside this view.
 */
skip = RETAIN([scanner charactersToBeSkipped]);
/*
 * With no skip set installed, the characters are consumed by the scan
 * below rather than being skipped implicitly - presumably this is what
 * lets the return value report whether any were present.
 */
[scanner setCharactersToBeSkipped: nil];
scanned = [scanner scanCharactersFromSet: skip intoString: 0];
/*
 * Put the original skip set back and drop our extra reference.
 */
[scanner setCharactersToBeSkipped: skip];
RELEASE(skip);
return scanned;
}
- (NSString*) scanSpecial: (NSScanner*)scanner
{
unsigned location;
unichar c;
/*
* Move past white space.
*/
skip = RETAIN([scanner charactersToBeSkipped]);
[scanner setCharactersToBeSkipped: nil];
[scanner scanCharactersFromSet: skip intoString: 0];
[scanner setCharactersToBeSkipped: skip];
RELEASE(skip);
[self scanPastSpace: scanner];
/*
* Now return token delimiter (may be whitespace)
@ -1231,7 +1304,7 @@ parseCharacterSet(NSString *token)
return hdr;
}
- (BOOL) _decodeBody
- (BOOL) _decodeBody: (NSData*)d
{
if (boundary == nil)
{
@ -1247,52 +1320,51 @@ parseCharacterSet(NSString *token)
}
else
{
unsigned length = [data length];
NSMutableData *decoded = [NSMutableData dataWithCapacity: length];
if (context->atEnd == YES)
{
if ([d length] > 0)
{
NSLog(@"Additional data ignored after parse complete");
}
return YES; /* Nothing more to do */
}
[self decodeData: data
fromRange: NSMakeRange(0, length)
intoData: decoded
[self decodeData: d
fromRange: NSMakeRange(0, [d length])
intoData: data
withContext: context];
if (context->pos != 0)
{
context->atEnd = YES;
[self decodeData: nil
fromRange: NSMakeRange(0, 0)
intoData: decoded
withContext: context];
}
/*
* If no content type is supplied, we assume text.
*/
if (type == nil || [type isEqualToString: @"text"] == YES)
{
NSDictionary *params;
NSString *charset;
NSStringEncoding stringEncoding;
NSString *string;
/*
* Assume that content type is best represented as NSString.
*/
params = [typeInfo objectForKey: @"Parameters"];
charset = [params objectForKey: @"charset"];
stringEncoding = parseCharacterSet(charset);
string = [[NSString alloc] initWithData: decoded
encoding: stringEncoding];
[document setContent: string];
RELEASE(string);
}
else
if (context->atEnd == YES)
{
/*
* Assume that any non-text content type is best
* represented as NSData.
* If no content type is supplied, we assume text.
*/
decoded = [decoded copy]; /* Ensure it's immutable */
[document setContent: decoded];
RELEASE(decoded);
if (type == nil || [type isEqualToString: @"text"] == YES)
{
NSDictionary *params;
NSString *charset;
NSStringEncoding stringEncoding;
NSString *string;
/*
* Assume that content type is best represented as NSString.
*/
params = [typeInfo objectForKey: @"Parameters"];
charset = [params objectForKey: @"charset"];
stringEncoding = parseCharacterSet(charset);
string = [[NSString alloc] initWithData: data
encoding: stringEncoding];
[document setContent: string];
RELEASE(string);
}
else
{
/*
* Assume that any non-text content type is best
* represented as NSData.
*/
[document setContent: AUTORELEASE([data copy])];
}
}
return YES;
}
@ -1304,6 +1376,10 @@ parseCharacterSet(NSString *token)
unsigned char bInit = bBytes[0];
BOOL done = NO;
[data appendBytes: [d bytes] length: [d length]];
bytes = (unsigned char*)[data mutableBytes];
dataEnd = [data length];
while (done == NO)
{
/*
@ -1367,10 +1443,11 @@ parseCharacterSet(NSString *token)
if ([child parse: d] == YES && [child parse: nil] == YES)
{
NSMutableArray *a;
GSMimeDocument *doc;
/*
* Store the document produced by the child, and
* create anew parser for the next section.
* create a new parser for the next section.
*/
a = [document content];
if (a == nil)
@ -1379,7 +1456,11 @@ parseCharacterSet(NSString *token)
[document setContent: a];
RELEASE(a);
}
[a addObject: [child document]];
doc = [child document];
if (doc != nil)
{
[a addObject: doc];
}
RELEASE(child);
child = [GSMimeParser new];
}
@ -1533,6 +1614,11 @@ parseCharacterSet(NSString *token)
return content;
}
/*
 * Copying does not create a new object - the receiver itself is
 * returned after being passed through RETAIN (a macro defined outside
 * this view).  The zone argument is not used.
 * NOTE(review): the caller and the 'copy' therefore share one instance,
 * so any later change made through one is visible through the other;
 * confirm this is intended for the enclosing class, whose declaration
 * is not visible here.
 */
- (id) copyWithZone: (NSZone*)z
{
return RETAIN(self);
}
- (void) dealloc
{
RELEASE(headers);