fixup getting charset

git-svn-id: svn+ssh://svn.gna.org/svn/gnustep/libs/base/trunk@28416 72102866-910b-0410-8b05-ffd578937521
This commit is contained in:
Richard Frith-MacDonald 2009-07-28 18:48:37 +00:00
parent f07bf5fa53
commit 3de3d7bd49

View file

@ -716,164 +716,6 @@ wordData(NSString *word)
return AUTORELEASE([[self alloc] init]);
}
/*
 * Examine xml data to find out the characterset needed to convert from
 * binary data to an NSString object.
 *
 * The characterset is determined, in order of preference, from:
 *   1. a byte-order-mark at the start of the data,
 *   2. the 'encoding' pseudo-attribute of a leading <?xml ... ?> declaration,
 *   3. the pattern of nul bytes in the first four bytes of the data, which
 *      shows whether characters are one, two or four bytes wide.
 *
 * Returns a characterset name (a declared name is returned in lowercase),
 * or nil if the data is less than four bytes long and therefore too short
 * to examine.
 */
+ (NSString*) charsetForXml: (NSData*)xml
{
  unsigned int          length = [xml length];
  const unsigned char   *ptr = (const unsigned char*)[xml bytes];
  const unsigned char   *end = ptr + length;
  unsigned int          offset = 0;     // Index of the low byte in a character
  unsigned int          size = 1;       // Number of bytes per character
  unsigned char         quote = 0;      // Current quote character (0 if none)
  unsigned char         buffer[30];     // Current token or quoted value
  unsigned int          buflen = 0;
  BOOL                  found = NO;     // Seen the 'encoding' name?
  NSString              *fallback;      // Default implied by character width

  if (length < 4)
    {
      // Not long enough to determine an encoding
      return nil;
    }

  /*
   * Determine encoding using byte-order-mark if present.
   * The four byte marks must be tested before the two byte ones, since
   * FF FE 00 00 and FE FF 00 00 both begin with a utf-16 mark.
   */
  if ((ptr[0] == 0x00 && ptr[1] == 0x00)
    && ((ptr[2] == 0xFE && ptr[3] == 0xFF)
      || (ptr[2] == 0xFF && ptr[3] == 0xFE)))
    {
      return @"ucs-4";
    }
  if ((ptr[2] == 0x00 && ptr[3] == 0x00)
    && ((ptr[0] == 0xFF && ptr[1] == 0xFE)
      || (ptr[0] == 0xFE && ptr[1] == 0xFF)))
    {
      return @"ucs-4";
    }
  if ((ptr[0] == 0xFE && ptr[1] == 0xFF)
    || (ptr[0] == 0xFF && ptr[1] == 0xFE))
    {
      return @"utf-16";
    }
  if (ptr[0] == 0xEF && ptr[1] == 0xBB && ptr[2] == 0xBF)
    {
      return @"utf-8";
    }

  /*
   * Look for nul bytes to determine whether this is a four byte
   * encoding or a two byte encoding (or the default).
   */
  if (ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0)
    {
      offset = 3;
      size = 4;
    }
  else if (ptr[0] == 0 && ptr[1] == 0 && ptr[3] == 0)
    {
      offset = 2;
      size = 4;
    }
  else if (ptr[0] == 0 && ptr[2] == 0 && ptr[3] == 0)
    {
      offset = 1;
      size = 4;
    }
  else if (ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0)
    {
      offset = 0;
      size = 4;
    }
  else if (ptr[0] == 0)
    {
      offset = 1;
      size = 2;
    }
  else if (ptr[1] == 0)
    {
      offset = 0;
      size = 2;
    }

  /*
   * The characterset to assume when no encoding is declared depends
   * only on the character width we have just worked out.
   */
  if (size == 1)
    {
      fallback = @"utf-8";
    }
  else if (size == 2)
    {
      fallback = @"utf-16";
    }
  else
    {
      fallback = @"ucs-4";
    }

  /*
   * Now look for the xml encoding declaration ...
   * Bounds are checked using the number of bytes remaining so that we
   * never form a pointer beyond the end of the data.
   */

  // Tolerate leading whitespace
  while ((unsigned int)(end - ptr) >= size && isspace(ptr[offset]))
    {
      ptr += size;
    }

  if ((unsigned int)(end - ptr) <= size * 20
    || ptr[offset] != '<' || ptr[offset + size] != '?')
    {
      // Too short to hold a declaration, or does not start with '<?'
      return fallback;
    }
  ptr += size * 5;      // Step past '<?xml' prefix

  while ((unsigned int)(end - ptr) >= size)
    {
      unsigned char     c = ptr[offset];

      ptr += size;
      if (quote == 0)
        {
          if (c == '>')
            {
              /*
               * End of the declaration ... stop here so that an
               * 'encoding' attribute of some later element is not
               * mistaken for the encoding of the document.
               */
              break;
            }
          else if (c == '\'' || c == '"')
            {
              buflen = 0;
              quote = c;        // Start of quoted section
            }
          else if (isspace(c) || c == '=')
            {
              // End of a name ... see if it was 'encoding'
              if (buflen == 8)
                {
                  buffer[8] = '\0';
                  if (strcasecmp((char*)buffer, "encoding") == 0)
                    {
                      found = YES;
                    }
                }
              buflen = 0;
            }
          else
            {
              // Overlong tokens wrap round rather than overflowing.
              if (buflen == sizeof(buffer)) buflen = 0;
              buffer[buflen++] = c;
            }
        }
      else if (c == quote)
        {
          if (found)
            {
              NSString  *tmp;

              // The quoted value following 'encoding' is the charset.
              tmp = [[NSString alloc] initWithBytes: buffer
                                             length: buflen
                                           encoding: NSASCIIStringEncoding];
              IF_NO_GC([tmp autorelease];)
              return [tmp lowercaseString];
            }
          buflen = 0;
          quote = 0;            // End of quoted section
        }
      else
        {
          if (buflen == sizeof(buffer)) buflen = 0;
          buffer[buflen++] = c;
        }
    }

  // No encoding declared ... use the default for the character width.
  return fallback;
}
/**
* Return a coding context object to be used for decoding data
* according to the scheme specified in the header.
@ -3705,6 +3547,164 @@ appendString(NSMutableData *m, unsigned offset, unsigned fold,
*/
@implementation GSMimeDocument
/*
 * Examine xml data to find out the characterset needed to convert from
 * binary data to an NSString object.
 *
 * The characterset is determined, in order of preference, from:
 *   1. a byte-order-mark at the start of the data,
 *   2. the 'encoding' pseudo-attribute of a leading <?xml ... ?> declaration,
 *   3. the pattern of nul bytes in the first four bytes of the data, which
 *      shows whether characters are one, two or four bytes wide.
 *
 * Returns a characterset name (a declared name is returned in lowercase),
 * or nil if the data is less than four bytes long and therefore too short
 * to examine.
 */
+ (NSString*) charsetForXml: (NSData*)xml
{
  unsigned int          length = [xml length];
  const unsigned char   *ptr = (const unsigned char*)[xml bytes];
  const unsigned char   *end = ptr + length;
  unsigned int          offset = 0;     // Index of the low byte in a character
  unsigned int          size = 1;       // Number of bytes per character
  unsigned char         quote = 0;      // Current quote character (0 if none)
  unsigned char         buffer[30];     // Current token or quoted value
  unsigned int          buflen = 0;
  BOOL                  found = NO;     // Seen the 'encoding' name?
  NSString              *fallback;      // Default implied by character width

  if (length < 4)
    {
      // Not long enough to determine an encoding
      return nil;
    }

  /*
   * Determine encoding using byte-order-mark if present.
   * The four byte marks must be tested before the two byte ones, since
   * FF FE 00 00 and FE FF 00 00 both begin with a utf-16 mark.
   */
  if ((ptr[0] == 0x00 && ptr[1] == 0x00)
    && ((ptr[2] == 0xFE && ptr[3] == 0xFF)
      || (ptr[2] == 0xFF && ptr[3] == 0xFE)))
    {
      return @"ucs-4";
    }
  if ((ptr[2] == 0x00 && ptr[3] == 0x00)
    && ((ptr[0] == 0xFF && ptr[1] == 0xFE)
      || (ptr[0] == 0xFE && ptr[1] == 0xFF)))
    {
      return @"ucs-4";
    }
  if ((ptr[0] == 0xFE && ptr[1] == 0xFF)
    || (ptr[0] == 0xFF && ptr[1] == 0xFE))
    {
      return @"utf-16";
    }
  if (ptr[0] == 0xEF && ptr[1] == 0xBB && ptr[2] == 0xBF)
    {
      return @"utf-8";
    }

  /*
   * Look for nul bytes to determine whether this is a four byte
   * encoding or a two byte encoding (or the default).
   */
  if (ptr[0] == 0 && ptr[1] == 0 && ptr[2] == 0)
    {
      offset = 3;
      size = 4;
    }
  else if (ptr[0] == 0 && ptr[1] == 0 && ptr[3] == 0)
    {
      offset = 2;
      size = 4;
    }
  else if (ptr[0] == 0 && ptr[2] == 0 && ptr[3] == 0)
    {
      offset = 1;
      size = 4;
    }
  else if (ptr[1] == 0 && ptr[2] == 0 && ptr[3] == 0)
    {
      offset = 0;
      size = 4;
    }
  else if (ptr[0] == 0)
    {
      offset = 1;
      size = 2;
    }
  else if (ptr[1] == 0)
    {
      offset = 0;
      size = 2;
    }

  /*
   * The characterset to assume when no encoding is declared depends
   * only on the character width we have just worked out.
   */
  if (size == 1)
    {
      fallback = @"utf-8";
    }
  else if (size == 2)
    {
      fallback = @"utf-16";
    }
  else
    {
      fallback = @"ucs-4";
    }

  /*
   * Now look for the xml encoding declaration ...
   * Bounds are checked using the number of bytes remaining so that we
   * never form a pointer beyond the end of the data.
   */

  // Tolerate leading whitespace
  while ((unsigned int)(end - ptr) >= size && isspace(ptr[offset]))
    {
      ptr += size;
    }

  if ((unsigned int)(end - ptr) <= size * 20
    || ptr[offset] != '<' || ptr[offset + size] != '?')
    {
      // Too short to hold a declaration, or does not start with '<?'
      return fallback;
    }
  ptr += size * 5;      // Step past '<?xml' prefix

  while ((unsigned int)(end - ptr) >= size)
    {
      unsigned char     c = ptr[offset];

      ptr += size;
      if (quote == 0)
        {
          if (c == '>')
            {
              /*
               * End of the declaration ... stop here so that an
               * 'encoding' attribute of some later element is not
               * mistaken for the encoding of the document.
               */
              break;
            }
          else if (c == '\'' || c == '"')
            {
              buflen = 0;
              quote = c;        // Start of quoted section
            }
          else if (isspace(c) || c == '=')
            {
              // End of a name ... see if it was 'encoding'
              if (buflen == 8)
                {
                  buffer[8] = '\0';
                  if (strcasecmp((char*)buffer, "encoding") == 0)
                    {
                      found = YES;
                    }
                }
              buflen = 0;
            }
          else
            {
              // Overlong tokens wrap round rather than overflowing.
              if (buflen == sizeof(buffer)) buflen = 0;
              buffer[buflen++] = c;
            }
        }
      else if (c == quote)
        {
          if (found)
            {
              NSString  *tmp;

              // The quoted value following 'encoding' is the charset.
              tmp = [[NSString alloc] initWithBytes: buffer
                                             length: buflen
                                           encoding: NSASCIIStringEncoding];
              IF_NO_GC([tmp autorelease];)
              return [tmp lowercaseString];
            }
          buflen = 0;
          quote = 0;            // End of quoted section
        }
      else
        {
          if (buflen == sizeof(buffer)) buflen = 0;
          buffer[buflen++] = c;
        }
    }

  // No encoding declared ... use the default for the character width.
  return fallback;
}
/**
* Return the MIME characterset name corresponding to the
* specified string encoding.<br />