/*
* wordlib.c
* Copyright (C) 1998-2004 A.J. van Os; Released under GNU GPL
*
* Description:
* Deal with the internals of a MS Word file
*/
#include "antiword.h"
/*
* File-scope flag: written by iGetVersionNumber() each time it is called and
* read back through bIsOldMacFile(). Starts out as FALSE.
*/
static BOOL bOldMacFile = FALSE;
/*
 * bCheckBytes - common part of the file checking functions
 *
 * Rewinds the file and compares its first tBytes bytes with aucBytes.
 * The file position is left wherever the last getc() put it.
 *
 * pFile	the file to inspect
 * aucBytes	the bytes the file is expected to start with
 * tBytes	the number of bytes in aucBytes
 *
 * fail() is invoked when pFile or aucBytes is NULL or when tBytes is zero
 * (presumably a debug assertion; its definition is not in this file).
 *
 * Returns TRUE when all bytes match, FALSE at the first difference or when
 * the end of the file is reached before tBytes bytes have been read
 */
static BOOL
bCheckBytes(FILE *pFile, const UCHAR *aucBytes, size_t tBytes)
{
	size_t	tIndex;
	int	iChar;

	fail(pFile == NULL || aucBytes == NULL || tBytes == 0);

	rewind(pFile);

	/*
	 * The index has the same type as the byte count, so the count is
	 * never narrowed to an int before the comparison.
	 */
	for (tIndex = 0; tIndex < tBytes; tIndex++) {
		iChar = getc(pFile);
		if (iChar == EOF || iChar != (int)aucBytes[tIndex]) {
			NO_DBG_HEX(iChar);
			NO_DBG_HEX(aucBytes[tIndex]);
			return FALSE;
		}
	}
	return TRUE;
} /* end of bCheckBytes */
/*
 * bIsWordForDosFile - is the given file a "Word for DOS" document?
 *
 * pFile	the file to inspect
 * lFilesize	the size of the file in bytes
 *
 * Returns FALSE when no file is given, when the size is negative or below
 * 128 bytes, or when the file does not start with the Word for DOS
 * signature; TRUE otherwise
 */
BOOL
bIsWordForDosFile(FILE *pFile, long lFilesize)
{
	/* Leading bytes of a Word for DOS file */
	static const UCHAR	aucSignature[] =
		{ 0x31, 0xbe, 0x00, 0x00, 0x00, 0xab };

	DBG_MSG("bIsWordForDosFile");

	if (pFile != NULL && lFilesize >= 0) {
		if (lFilesize >= 128) {
			return bCheckBytes(pFile,
					aucSignature,
					elementsof(aucSignature));
		}
		DBG_MSG("File too small to be a Word document");
		return FALSE;
	}
	DBG_MSG("No proper file given");
	return FALSE;
} /* end of bIsWordForDosFile */
/*
 * bIsWordFileWithOLE - is the given file a file with an OLE envelope?
 * (That is a document made by Word 6 or later)
 *
 * pFile	the file to inspect
 * lFilesize	the size of the file in bytes
 *
 * The file must hold at least three big blocks. Its size is expected to be
 * a whole number of big blocks; a tail of one or two bytes is tolerated
 * unless the size modulo 3 equals that tail.
 *
 * Returns TRUE when the size is acceptable and the file starts with the
 * eight signature bytes, otherwise FALSE
 */
static BOOL
bIsWordFileWithOLE(FILE *pFile, long lFilesize)
{
	static const UCHAR	aucSignature[] =
		{ 0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1 };
	int	iTailLen;

	if (pFile == NULL || lFilesize < 0) {
		DBG_MSG("No proper file given");
		return FALSE;
	}
	if (lFilesize < (long)BIG_BLOCK_SIZE * 3) {
		DBG_MSG("This file is too small to be a Word document");
		return FALSE;
	}

	iTailLen = (int)(lFilesize % BIG_BLOCK_SIZE);
	if (iTailLen < 0 || iTailLen > 2) {
		/* Wrong filesize for a Word document */
		DBG_DEC(lFilesize);
		DBG_DEC(iTailLen);
		return FALSE;
	}
	if (iTailLen != 0) {
		/* Filesize mismatch or a buggy email program */
		if ((int)(lFilesize % 3) == iTailLen) {
			DBG_DEC(lFilesize);
			return FALSE;
		}
		/*
		 * Ignore extra bytes caused by buggy email programs.
		 * They have bugs in their base64 encoding or decoding.
		 * 3 bytes -> 4 ascii chars -> 3 bytes
		 */
		DBG_MSG("Document with extra bytes");
	}
	/* Either no tail at all, or a tail that is tolerated */
	return bCheckBytes(pFile, aucSignature, elementsof(aucSignature));
} /* end of bIsWordFileWithOLE */
/*
* This function checks whether the given file is or is not a RTF document
*/
BOOL
bIsRtfFile(FILE *pFile)
{
static UCHAR aucBytes[] =
{ '{', '\\', 'r', 't', 'f', '1' };
DBG_MSG("bIsRtfFile");
return bCheckBytes(pFile, aucBytes, elementsof(aucBytes));
} /* end of bIsRtfFile */
/*
* This function checks whether the given file is or is not a WP document
*/
BOOL
bIsWordPerfectFile(FILE *pFile)
{
static UCHAR aucBytes[] =
{ 0xff, 'W', 'P', 'C' };
DBG_MSG("bIsWordPerfectFile");
return bCheckBytes(pFile, aucBytes, elementsof(aucBytes));
} /* end of bIsWordPerfectFile */
/*
 * bIsWinWord12File - is the given file a "Win Word 1 or 2" document?
 *
 * pFile	the file to inspect
 * lFilesize	the size of the file in bytes
 *
 * Returns FALSE when no file is given, when the size is negative or below
 * 384 bytes, or when the file starts with neither of the two known
 * signatures; TRUE otherwise
 */
BOOL
bIsWinWord12File(FILE *pFile, long lFilesize)
{
	static const UCHAR	aucSignatures[2][4] = {
		{ 0x9b, 0xa5, 0x21, 0x00 },	/* Win Word 1.x */
		{ 0xdb, 0xa5, 0x2d, 0x00 },	/* Win Word 2.0 */
	};
	int	iEntry;

	DBG_MSG("bIsWinWord12File");

	if (pFile == NULL || lFilesize < 0) {
		DBG_MSG("No proper file given");
		return FALSE;
	}
	if (lFilesize < 384) {
		DBG_MSG("This file is too small to be a Word document");
		return FALSE;
	}

	/* Try the signatures one by one; the first hit settles it */
	iEntry = 0;
	while (iEntry < (int)elementsof(aucSignatures)) {
		if (bCheckBytes(pFile,
				aucSignatures[iEntry],
				elementsof(aucSignatures[iEntry]))) {
			return TRUE;
		}
		iEntry++;
	}
	return FALSE;
} /* end of bIsWinWord12File */
/*
 * bIsMacWord45File - is the given file a "Mac Word 4 or 5" document?
 *
 * pFile	the file to inspect
 *
 * A NULL pFile is rejected here, in the same way as the other file
 * checking functions do, because bCheckBytes() does not accept one.
 * iGuessVersionNumber() passes its file pointer on without checking it,
 * so a NULL pointer can actually arrive here.
 *
 * Returns TRUE when the file starts with one of the two known signatures,
 * otherwise FALSE
 */
BOOL
bIsMacWord45File(FILE *pFile)
{
	static UCHAR	aucBytes[2][6] = {
		{ 0xfe, 0x37, 0x00, 0x1c, 0x00, 0x00 },	/* Mac Word 4 */
		{ 0xfe, 0x37, 0x00, 0x23, 0x00, 0x00 },	/* Mac Word 5 */
	};
	int	iIndex;

	DBG_MSG("bIsMacWord45File");

	if (pFile == NULL) {
		DBG_MSG("No proper file given");
		return FALSE;
	}

	for (iIndex = 0; iIndex < (int)elementsof(aucBytes); iIndex++) {
		if (bCheckBytes(pFile,
				aucBytes[iIndex],
				elementsof(aucBytes[iIndex]))) {
			return TRUE;
		}
	}
	return FALSE;
} /* end of bIsMacWord45File */
/*
 * iGuessVersionNumber - guess the Word version number from first few bytes
 *
 * pFile	the file to inspect
 * lFilesize	the size of the file in bytes
 *
 * The checks are tried in a fixed order: Word for DOS, Win Word 1 or 2,
 * Mac Word 4 or 5 and finally a file with an OLE envelope.
 *
 * Returns the guessed version number (0, 2, 5 or 6) or -1 when no guess
 * is possible
 */
int
iGuessVersionNumber(FILE *pFile, long lFilesize)
{
	int	iVersion;

	if (bIsWordForDosFile(pFile, lFilesize)) {
		iVersion = 0;
	} else if (bIsWinWord12File(pFile, lFilesize)) {
		iVersion = 2;
	} else if (bIsMacWord45File(pFile)) {
		iVersion = 5;
	} else if (bIsWordFileWithOLE(pFile, lFilesize)) {
		iVersion = 6;
	} else {
		iVersion = -1;
	}
	return iVersion;
} /* end of iGuessVersionNumber */
/*
 * iGetVersionNumber - get the Word version number from the header
 *
 * aucHeader	the header bytes of the document
 *
 * The identification word is taken from offset 0x02. A value of 0x1000 or
 * more is taken to mean a Mac Word file and the word is read again with
 * usGetWordBE(). For the values 103 and 104, and for all values not listed
 * below, the word at offset 0x14 is consulted as well.
 *
 * Side effect: the module flag reported by bIsOldMacFile() is cleared and
 * then set to TRUE for Word 4, 5 and 6 for Macintosh.
 *
 * Returns the version number or -1 when unknown:
 *	0	Word for DOS
 *	1, 2	Word 1.x, Word 2 for Windows
 *	4, 5	Word 4, Word 5 for Macintosh
 *	6	Word 6 for Windows or for Macintosh
 *	7	Word 7 for Win95
 *	8	identification word of 192 or more
 */
int
iGetVersionNumber(const UCHAR *aucHeader)
{
	USHORT	usFib, usChse;

	usFib = usGetWord(0x02, aucHeader);
	if (usFib >= 0x1000) {
		/* Too big: must be MacWord using Big Endian */
		DBG_HEX(usFib);
		usFib = usGetWordBE(0x02, aucHeader);
	}
	DBG_DEC(usFib);
	bOldMacFile = FALSE;

	/* The versions that are fully decided by the identification word */
	if (usFib == 0) {
		DBG_MSG("Word for DOS");
		return 0;
	}
	if (usFib == 28) {
		DBG_MSG("Word 4 for Macintosh");
		bOldMacFile = TRUE;
		return 4;
	}
	if (usFib == 33) {
		DBG_MSG("Word 1.x for Windows");
		return 1;
	}
	if (usFib == 35) {
		DBG_MSG("Word 5 for Macintosh");
		bOldMacFile = TRUE;
		return 5;
	}
	if (usFib == 45) {
		DBG_MSG("Word 2 for Windows");
		return 2;
	}
	if (usFib == 101 || usFib == 102) {
		DBG_MSG("Word 6 for Windows");
		return 6;
	}

	/* Everything that is left looks at the word at offset 0x14 too */
	usChse = usGetWord(0x14, aucHeader);
	DBG_DEC(usChse);

	if (usFib == 103 || usFib == 104) {
		if (usChse == 0) {
			DBG_MSG("Word 7 for Win95");
			return 7;
		}
		if (usChse != 256) {
			/* Not conclusive, let the byte at 0x05 decide */
			DBG_FIXME();
			if ((int)ucGetByte(0x05, aucHeader) == 0xe0) {
				DBG_MSG("Word 7 for Win95");
				return 7;
			}
		}
		DBG_MSG("Word 6 for Macintosh");
		bOldMacFile = TRUE;
		return 6;
	}

	if (usFib < 192) {
		/* Unknown or unsupported version of Word */
		DBG_DEC(usFib);
		return -1;
	}
	DBG_MSG_C(usChse != 256, "Word97 for Win95/98/NT");
	DBG_MSG_C(usChse == 256, "Word98 for Macintosh");
	return 8;
} /* end of iGetVersionNumber */
/*
 * bIsOldMacFile - was the current file made on an old Apple Macintosh Word?
 *
 * Returns TRUE if the current file was made by Word version 6 or older on
 * an Apple Macintosh, otherwise FALSE. The answer is the flag that was
 * recorded by the latest call to iGetVersionNumber().
 * This function hides the method of how to find out from the rest of the
 * program.
 */
BOOL
bIsOldMacFile(void)
{
	const BOOL	bMadeOnOldMac = bOldMacFile;

	return bMadeOnOldMac;
} /* end of bIsOldMacFile */
/*
 * iInitDocument - initialize a document
 *
 * pFile	the file that holds the document
 * lFilesize	the size of the file in bytes
 *
 * The kind of file is guessed with iGuessVersionNumber() and the matching
 * initialization function (DOS, WIN, MAC or OLE) is called.
 *
 * Returns the version of Word that made the document, as reported by that
 * initialization function, or -1 when the kind of file is not recognized
 */
int
iInitDocument(FILE *pFile, long lFilesize)
{
	int	iGuess;

	iGuess = iGuessVersionNumber(pFile, lFilesize);
	switch (iGuess) {
	case 0:
		return iInitDocumentDOS(pFile, lFilesize);
	case 2:
		return iInitDocumentWIN(pFile, lFilesize);
	case 5:
		return iInitDocumentMAC(pFile, lFilesize);
	case 6:
		return iInitDocumentOLE(pFile, lFilesize);
	default:
		break;
	}
	/* No known kind of Word file */
	DBG_DEC(iGuess);
	return -1;
} /* end of iInitDocument */
/*
* vFreeDocument - free a document by free-ing its parts
*
* Takes no arguments and returns nothing. Calls the destroy function of
* every part of the document, one after the other.
* NOTE(review): the destroy functions are defined elsewhere; presumably each
* one releases the memory of its own list or table - confirm there.
* NOTE(review): whether the order of the calls matters cannot be told from
* this file, so keep the order as it is unless that has been verified.
*/
void
vFreeDocument(void)
{
DBG_MSG("vFreeDocument");
/* Free the memory */
/* Text and data blocks */
vDestroyTextBlockList();
vDestroyDataBlockList();
/* Formatting related information */
vDestroyListInfoList();
vDestroyRowInfoList();
vDestroyStyleInfoList();
vDestroyFontInfoList();
vDestroyStylesheetList();
vDestroyPictInfoList();
/* Document, section and header/footer information */
vDestroyDocumentInfoList();
vDestroySectionInfoList();
vDestroyHdrFtrInfoList();
/* The remaining parts */
vDestroyPropModList();
vDestroyNotesInfoLists();
vDestroyFontTable();
vDestroySummaryInfo();
} /* end of vFreeDocument */
|