/*
* findtext.c
* Copyright (C) 1998-2004 A.J. van Os; Released under GNU GPL
*
* Description:
* Find the blocks that contain the text of MS Word files
*/
#include <stdio.h>
#include <stdlib.h>
#include "antiword.h"
/*
 * bAddTextBlocks - Add the blocks to the text block list
 *
 * Walks the block chain that starts at ulStartBlock, using aulBBD to find
 * the successor of each block. The first ulCharPosFirst bytes of the chain
 * are skipped; after that one entry is added to the text block list for
 * every block (or part of a block) until the requested amount of text has
 * been covered.
 *
 * ulCharPosFirst	character position of the first block; also the
 *			number of bytes to skip at the start of the chain
 * ulTotalLength	length of the text in characters
 * bUsesUnicode		TRUE: two bytes per character, FALSE: one byte
 * usPropMod		value copied into every text block that is added
 * ulStartBlock		first block of the chain
 * aulBBD, tBBDLen	the Big Block Depot and its number of entries
 *
 * Returns TRUE when all the text was covered, FALSE when the text block
 * list refused a block or when the chain did not yield exactly the
 * requested number of bytes
 */
BOOL
bAddTextBlocks(ULONG ulCharPosFirst, ULONG ulTotalLength,
	BOOL bUsesUnicode, USHORT usPropMod,
	ULONG ulStartBlock, const ULONG *aulBBD, size_t tBBDLen)
{
	text_block_type	tTextBlock;
	ULONG	ulIndex, ulNextCharPos, ulSkip;
	long	lToGo;

	fail(ulTotalLength > (ULONG)LONG_MAX / 2);
	fail(ulStartBlock > MAX_BLOCKNUMBER && ulStartBlock != END_OF_CHAIN);
	fail(aulBBD == NULL);

	NO_DBG_HEX(ulCharPosFirst);
	NO_DBG_DEC(ulTotalLength);

	/* Convert the length in characters to a length in bytes */
	lToGo = (long)ulTotalLength;
	if (bUsesUnicode) {
		/* Two bytes for every character */
		NO_DBG_MSG("Uses Unicode");
		lToGo *= 2;
	} else {
		/* A single byte for every character */
		NO_DBG_MSG("Uses ASCII");
	}

	ulNextCharPos = ulCharPosFirst;
	ulSkip = ulCharPosFirst;
	ulIndex = ulStartBlock;
	while (ulIndex != END_OF_CHAIN && lToGo > 0) {
		if (ulIndex >= (ULONG)tBBDLen) {
			DBG_DEC(ulIndex);
			DBG_DEC(tBBDLen);
			werr(1, "The Big Block Depot is damaged");
		}
		if (ulSkip < BIG_BLOCK_SIZE) {
			/* (Part of) this block belongs to the text */
			tTextBlock.bUsesUnicode = bUsesUnicode;
			tTextBlock.usPropMod = usPropMod;
			tTextBlock.ulCharPos = ulNextCharPos;
			tTextBlock.ulFileOffset =
				(ulIndex + 1) * BIG_BLOCK_SIZE + ulSkip;
			tTextBlock.ulLength = min(BIG_BLOCK_SIZE - ulSkip,
						(ULONG)lToGo);
			/* Only the first block used can start halfway */
			ulSkip = 0;
			if (!bAdd2TextBlockList(&tTextBlock)) {
				DBG_HEX(tTextBlock.ulFileOffset);
				DBG_HEX(tTextBlock.ulCharPos);
				DBG_DEC(tTextBlock.ulLength);
				DBG_DEC(tTextBlock.bUsesUnicode);
				DBG_DEC(tTextBlock.usPropMod);
				return FALSE;
			}
			ulNextCharPos += tTextBlock.ulLength;
			lToGo -= (long)tTextBlock.ulLength;
		} else {
			/* The text starts beyond this block */
			ulSkip -= BIG_BLOCK_SIZE;
		}
		/* On to the next block of the chain */
		ulIndex = aulBBD[ulIndex];
	}
	DBG_DEC_C(lToGo != 0, lToGo);
	return lToGo == 0;
} /* end of bAddTextBlocks */
/*
 * bGet6DocumentText - make a list of the text blocks of Word 6/7 files
 *
 * Code for "fast saved" files.
 *
 * The text information (its offset and length are taken from the header at
 * 0x160 and 0x164) is read into a buffer and scanned record by record:
 *   type 0: a single byte is skipped
 *   type 1: the record is handed to vAdd2PropModList
 *   type 2: the piece table; every piece is handed to bAddTextBlocks and
 *           the scan ends
 * Any other record type is reported as an error.
 *
 * The contents of the buffer come straight from the file, so every record
 * is checked against the size of the buffer before it is used.
 *
 * Returns TRUE when successful, FALSE if not
 */
BOOL
bGet6DocumentText(FILE *pFile, BOOL bUsesUnicode, ULONG ulStartBlock,
	const ULONG *aulBBD, size_t tBBDLen, const UCHAR *aucHeader)
{
	UCHAR	*aucBuffer;
	ULONG	ulBeginTextInfo, ulTextOffset, ulTotLength;
	size_t	tTextInfoLen, tLeft;
	int	iIndex, iType, iOff, iLen, iPieces;
	USHORT	usPropMod;

	DBG_MSG("bGet6DocumentText");

	fail(pFile == NULL);
	fail(aulBBD == NULL);
	fail(aucHeader == NULL);

	ulBeginTextInfo = ulGetLong(0x160, aucHeader);	/* fcClx */
	DBG_HEX(ulBeginTextInfo);
	tTextInfoLen = (size_t)ulGetLong(0x164, aucHeader);	/* lcbClx */
	DBG_DEC(tTextInfoLen);

	aucBuffer = xmalloc(tTextInfoLen);
	if (!bReadBuffer(pFile, ulStartBlock,
			aulBBD, tBBDLen, BIG_BLOCK_SIZE,
			aucBuffer, ulBeginTextInfo, tTextInfoLen)) {
		aucBuffer = xfree(aucBuffer);
		return FALSE;
	}
	NO_DBG_PRINT_BLOCK(aucBuffer, tTextInfoLen);

	iOff = 0;
	while ((size_t)iOff < tTextInfoLen) {
		iType = (int)ucGetByte(iOff, aucBuffer);
		iOff++;
		if (iType == 0) {
			DBG_FIXME();
			iOff++;
			continue;
		}
		if (iType != 1 && iType != 2) {
			werr(0, "Unknown type of 'fastsaved' format");
			aucBuffer = xfree(aucBuffer);
			return FALSE;
		}
		/*
		 * Both remaining record types start with a length word.
		 * iOff is at most tTextInfoLen here, so tLeft cannot wrap.
		 */
		tLeft = tTextInfoLen - (size_t)iOff;
		if (tLeft < 2) {
			DBG_MSG("The text information is truncated");
			aucBuffer = xfree(aucBuffer);
			return FALSE;
		}
		iLen = (int)usGetWord(iOff, aucBuffer);
		if (iType == 1) {
			/*
			 * vAdd2PropModList only gets a pointer; presumably it
			 * reads the length word plus iLen bytes, so that
			 * range must lie inside the buffer.
			 */
			if ((size_t)iLen + 2 > tLeft) {
				DBG_DEC(iLen);
				aucBuffer = xfree(aucBuffer);
				return FALSE;
			}
			vAdd2PropModList(aucBuffer + iOff);
			iOff += iLen + 2;
			continue;
		}
		/*
		 * Type 2
		 * NOTE(review): only a word is read, but the offset is
		 * advanced by four bytes - confirm the size of this length
		 * field against the Word 6 format description.
		 */
		NO_DBG_DEC(iLen);
		iOff += 4;
		iPieces = (iLen - 4) / 12;
		DBG_DEC(iPieces);
		/*
		 * The loop below reads (iPieces + 1) longs followed by
		 * iPieces records of eight bytes: 12 * iPieces + 4 bytes
		 * after the four bytes of the length field.
		 */
		if (iPieces > 0 &&
		    (tLeft < 8 || (size_t)iPieces > (tLeft - 8) / 12)) {
			DBG_MSG("The piece table does not fit");
			aucBuffer = xfree(aucBuffer);
			return FALSE;
		}
		for (iIndex = 0; iIndex < iPieces; iIndex++) {
			ulTextOffset = ulGetLong(
				iOff + (iPieces + 1) * 4 + iIndex * 8 + 2,
				aucBuffer);
			usPropMod = usGetWord(
				iOff + (iPieces + 1) * 4 + iIndex * 8 + 6,
				aucBuffer);
			/* Length = difference of two successive positions */
			ulTotLength = ulGetLong(iOff + (iIndex + 1) * 4,
						aucBuffer) -
					ulGetLong(iOff + iIndex * 4,
						aucBuffer);
			NO_DBG_HEX_C(usPropMod != 0, usPropMod);
			if (!bAddTextBlocks(ulTextOffset, ulTotLength,
					bUsesUnicode, usPropMod,
					ulStartBlock,
					aulBBD, tBBDLen)) {
				aucBuffer = xfree(aucBuffer);
				return FALSE;
			}
		}
		/* Nothing is read after the piece table */
		break;
	}
	aucBuffer = xfree(aucBuffer);
	return TRUE;
} /* end of bGet6DocumentText */
/*
* bGet8DocumentText - make a list of the text blocks of Word 8/97 files
*
* Returns TRUE when successful, FALSE if not
*/
BOOL
bGet8DocumentText(FILE *pFile, const pps_info_type *pPPS,
const ULONG *aulBBD, size_t tBBDLen,
const ULONG *aulSBD, size_t tSBDLen,
const UCHAR *aucHeader)
{
const ULONG *aulBlockDepot;
UCHAR *aucBuffer;
ULONG ulTextOffset, ulBeginTextInfo;
ULONG ulTotLength, ulLen;
long lIndex, lPieces, lOff;
size_t tTextInfoLen, tBlockDepotLen, tBlockSize;
int iType, iLen;
BOOL bUsesUnicode;
USHORT usPropMod;
DBG_MSG("bGet8DocumentText");
fail(pFile == NULL || pPPS == NULL);
fail(aulBBD == NULL || aulSBD == NULL);
fail(aucHeader == NULL);
ulBeginTextInfo = ulGetLong(0x1a2, aucHeader); /* fcClx */
DBG_HEX(ulBeginTextInfo);
tTextInfoLen = (size_t)ulGetLong(0x1a6, aucHeader); /* lcbClx */
DBG_DEC(tTextInfoLen);
DBG_DEC(pPPS->tTable.ulSB);
DBG_HEX(pPPS->tTable.ulSize);
if (pPPS->tTable.ulSize == 0) {
return FALSE;
}
if (pPPS->tTable.ulSize < MIN_SIZE_FOR_BBD_USE) {
/* Use the Small Block Depot */
aulBlockDepot = aulSBD;
tBlockDepotLen = tSBDLen;
tBlockSize = SMALL_BLOCK_SIZE;
} else {
/* Use the Big Block Depot */
aulBlockDepot = aulBBD;
tBlockDepotLen = tBBDLen;
tBlockSize = BIG_BLOCK_SIZE;
}
aucBuffer = xmalloc(tTextInfoLen);
if (!bReadBuffer(pFile, pPPS->tTable.ulSB,
aulBlockDepot, tBlockDepotLen, tBlockSize,
aucBuffer, ulBeginTextInfo, tTextInfoLen)) {
aucBuffer = xfree(aucBuffer);
return FALSE;
}
NO_DBG_PRINT_BLOCK(aucBuffer, tTextInfoLen);
lOff = 0;
while (lOff < (long)tTextInfoLen) {
iType = (int)ucGetByte(lOff, aucBuffer);
lOff++;
if (iType == 0) {
DBG_FIXME();
lOff++;
continue;
}
if (iType == 1) {
iLen = (int)usGetWord(lOff, aucBuffer);
vAdd2PropModList(aucBuffer + lOff);
lOff += (long)iLen + 2;
continue;
}
if (iType != 2) {
werr(0, "Unknown type of 'fastsaved' format");
aucBuffer = xfree(aucBuffer);
return FALSE;
}
/* Type 2 */
ulLen = ulGetLong(lOff, aucBuffer);
if (ulLen < 4) {
DBG_DEC(ulLen);
return FALSE;
}
lOff += 4;
lPieces = (long)((ulLen - 4) / 12);
DBG_DEC(lPieces);
for (lIndex = 0; lIndex < lPieces; lIndex++) {
ulTextOffset = ulGetLong(
lOff + (lPieces + 1) * 4 + lIndex * 8 + 2,
aucBuffer);
usPropMod = usGetWord(
lOff + (lPieces + 1) * 4 + lIndex * 8 + 6,
aucBuffer);
ulTotLength = ulGetLong(lOff + (lIndex + 1) * 4,
aucBuffer) -
ulGetLong(lOff + lIndex * 4,
aucBuffer);
if ((ulTextOffset & BIT(30)) == 0) {
bUsesUnicode = TRUE;
} else {
bUsesUnicode = FALSE;
ulTextOffset &= ~BIT(30);
ulTextOffset /= 2;
}
NO_DBG_HEX_C(usPropMod != 0, usPropMod);
if (!bAddTextBlocks(ulTextOffset, ulTotLength,
bUsesUnicode, usPropMod,
pPPS->tWordDocument.ulSB,
aulBBD, tBBDLen)) {
aucBuffer = xfree(aucBuffer);
return FALSE;
}
}
break;
}
aucBuffer = xfree(aucBuffer);
return TRUE;
} /* end of bGet8DocumentText */
|