Icon View Thread

The following is the text of the current message along with any replies.
Messages 1 to 8 of 8 total
Thread Indexing XML documents
Tue, Oct 12 2010 10:35 AMPermanent Link

Kong Aw

I am storing XML documents in memo fields.  I don't want the node names to be included in the index but the same names may be in the text and I want them included.  Is there a way to do this?

KK Aw
Tue, Oct 12 2010 11:17 AMPermanent Link

Roy Lambert

NLH Associates

Team Elevate Team Elevate

Kong


You have to create a custom function to do it. Here's mine for altering html

The first thing you have to do is let the engine know it exists. I use the datamodule create event. It has to be done before you open any sessions, databases, or table.

procedure Tmnd.DataModuleCreate(Sender: TObject);
begin
with Engine do begin
 Active := False;
 MaxTableDataBufferSize := MaxTableDataBufferSize * 4;
 MaxTableDataBufferCount := MaxTableDataBufferCount * 4;
 MaxTableIndexBufferSize := MaxTableIndexBufferSize * 2;
 MaxTableIndexBufferCount := MaxTableIndexBufferCount * 2;
//MaxTableBlobBufferSize
//MaxTableBlobBufferCount
 LargeFileSupport := True;
 OnTextIndexFilter := ftiMassage;
 BeforeDeleteTrigger := BeforeDeleteTrigger;
 Active := True;
end;
end;

The function

procedure Tmnd.ftiMassage(Sender: TObject; const TableName, FieldName: string; var TextToIndex: string);
var
Cntr: integer;
ThisUn: string;
Skip: boolean;
WorkStr: string;
sl: TStringList;
slCnt: integer;
MinWordLength: integer;
MaxWordLength: integer;
ftiWebStuff: boolean;
ftiHelp: TDBISAMTable;
ftiSession: TDBISAMSession;
URLandEmail: string;
const
Delims = [' ', ',', '.', ';', ':', '!', '"', '?', '(', ')', '/', '\', '>', '<', '[', ']', '}', '{'];
Alphas = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', '-'];
function RemoveHTML(FormattedText: string): string;
var
 CharsToProcess: integer;
 CurrentChar: integer;
 TestChar: Char;
 function HTMLTag: string;
 var
  Tag: string;
  CRCheck: string;
  EqPos: integer;
  procedure FindClosingTag(WhatTag: string);
  var
   TagLength: integer;
   function NoTagMatch: boolean;
   var
    Cntr: integer;
   begin
    Result := False;
    for Cntr := 1 to TagLength do begin
     if WhatTag[Cntr] <> UpperCase(FormattedText[CurrentChar + Cntr]) then begin
      Result := True;
      Break;
     end;
    end;
   end;
  begin
   TagLength := Length(WhatTag);
   while (CurrentChar < CharsToProcess - TagLength) and (NoTagMatch) do inc(CurrentChar);
   CurrentChar := CurrentChar + TagLength + 1;
  end;
 begin
  Tag := '';
  while (FormattedText[CurrentChar] <> '>') and (CurrentChar <= CharsToProcess) do begin
   Tag := Tag + FormattedText[CurrentChar];
   inc(CurrentChar);
  end;
  Tag := Tag + FormattedText[CurrentChar]; //This should get the >
  Tag := UpperCase(Tag);
  if Tag = '<HEAD>' then begin
   FindClosingTag('</HEAD>');
  end else if Tag = '<XML>' then begin
   FindClosingTag('</XML>');
  end else if Tag = '<TITLE>' then begin
// We need to dump everything until the closing tag
   FindClosingTag('</TITLE>');
   Result := '';
  end else if Copy(Tag, 1, 6) = '<STYLE' then begin
   FindClosingTag('</STYLE>');
// We need to dump everything until the closing tag - especially css stuff
   Result := '';
  end else if Tag = '<BR>' then begin
   Result := #13#10;
  end else if Copy(Tag, 1, 2) = '<P' then begin
   Result := #13#10;
  end else if Tag = '</DIV>' then begin
   if CurrentChar < CharsToProcess then begin
    if (FormattedText[CurrentChar - 6] <> '>') and (FormattedText[CurrentChar - 10] <> '<') then Result := #13#10 else begin
     CRCheck := FormattedText[CurrentChar - 10] +
      FormattedText[CurrentChar - 9] +
      FormattedText[CurrentChar - 8] +
      FormattedText[CurrentChar - 7] +
      FormattedText[CurrentChar - 6];
     if UpperCase(CRCheck) <> '<DIV>' then Result := #13#10;
    end
   end else Result := '';
  end else if (Copy(Tag, 1, 3) = '</H') and (Tag[4] in ['0'..'9']) then begin
   Result := #13#10;
  end else Result := '';
 end;
 function SpecialChar: string;
 var
  HTMLChar: string;
 begin
  HTMLChar := '';
  while (FormattedText[CurrentChar] <> ';') and (CurrentChar <= CharsToProcess) do begin
   HTMLChar := HTMLChar + FormattedText[CurrentChar];
   inc(CurrentChar);
  end;
  HTMLChar := LowerCase(HTMLChar + FormattedText[CurrentChar]); //This should get the ;
  Result := '';
 end;
begin
 if 0 <> Pos('<html', LowerCase(FormattedText)) then begin
  Result := '';
  CharsToProcess := Length(FormattedText);
  CurrentChar := 1;
  while CurrentChar <= CharsToProcess do begin
   TestChar := FormattedText[CurrentChar];
   case TestChar of
    #0..#9, #11, #12, #14..#31: {do nothing};
    '<': Result := Result + HTMLTag;
    '&': Result := Result + SpecialChar;
   else Result := Result + TestChar;
   end;
   inc(CurrentChar);
  end;
 end else Result := FormattedText;
end;
function LineIsUUEncoded: boolean;
var
 uuCntr: integer;
const
 TableUU = '`!"#$%&''()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_';
begin
 Result := False;
 if (Length(WorkStr) > MinWordLength) and (((((Length(WorkStr) - 1) * 3) / 4) = Pos(WorkStr[1], TableUU) - 1) and (WorkStr[1] < 'a')) then begin
// Only if it hits here is there a possibility that its UUEncoded, but we need to check to make sure
  Result := True;
  for uuCntr := 1 to Length(WorkStr) do begin
   if 0 = Pos(WorkStr[uuCntr], TableUU) then begin
    Result := False;
    Break;
   end;
  end;
 end;
end;
function GetWebStuff: string;
var
 wsl: TStringList;
 wCntr: integer;
 lCntr: integer;
 Line: string;
 DelimPos: integer;
const
 wDelims = [' ', ',', ';', ':', '!', '"', '?', '(', ')', '/', '/', '>', '<', '[', ']', '}', '{'];
 function SeekEnd: string;
 var
  Chop: integer;
 begin
  Chop := lCntr + 1;
  while Chop <= Length(Line) do begin
   if (Line[Chop] in wDelims) then Break else inc(Chop);
  end;
  Result := Copy(Line, DelimPos, Chop - DelimPos) + ' ';
  Delete(Line, DelimPos, Chop - DelimPos);
  lCntr := DelimPos;
 end;
begin
{
Rules
1. If the line is less than 10 characters just forget it
2. Only interested in URLs starting www.
3. I'm only interested in the base URL ie if any / then chop before it
4. I'm only interested in one line at a time
}
 wsl := TStringList.Create;
 wsl.Text := TextToIndex;
 TextToIndex := '';
 Result := '';
 for wCntr := 0 to wsl.Count - 1 do begin
  DelimPos := 1;
  if Length(wsl[wCntr]) > 10 then begin
   Line := wsl[wCntr];
   lCntr := 1;
   while lCntr <= Length(Line) do begin
    if Line[lCntr] in wDelims then DelimPos := lCntr + 1;
    if Line[lCntr] = '@' then Result := Result + SeekEnd;
    if LowerCase(Copy(Line, lCntr, 4)) = 'www.' then Result := Result + SeekEnd;
    inc(lCntr);
   end;
   if Line <> '' then TextToIndex := TextToIndex + #13#10 + Line;
  end else TextToIndex := TextToIndex + #13#10 + wsl[wCntr];
 end;
 wsl.Free;
end;
procedure AddWordToBeIndexed;
begin
 if ThisUn = '' then Exit;
 if ThisUn[Length(ThisUn)] = '-' then Delete(ThisUn, Length(ThisUn), 1);
 if (Length(ThisUn) > MinWordLength) and (Length(ThisUn) <= MaxWordLength) then TextToIndex := TextToIndex + ThisUn + ' ';
 ThisUn := '';
end;
begin
if (TextToIndex = '') or (FieldName = '_Flags') then Exit;
if FieldName = '_Headers' then begin
 TextToIndex := GetWebStuff;
 Exit;
end;
ftiSession := MakeDBISAMSession;
ftiHelp := MakeDBISAMTable('ftiHelper', 'Memory', ftiSession);
ftiHelp.Open;
MinWordLength := ftiHelp.FieldByName('_MinWordLength').AsInteger;
MaxWordLength := ftiHelp.FieldByName('_MaxWordLength').AsInteger;
ftiWebStuff := ftiHelp.FieldByName('_ftiWebStuff').AsBoolean;
ftiHelp.Close;
ftiHelp.Free;
ftiSession.Free;
sl := TStringList.Create;
URLandEmail := GetWebStuff;
if FieldName = '_Message' then sl.Text := RemoveHTML(TextToIndex) else sl.Text := TextToIndex;
TextToIndex := '';
for slCnt := 0 to sl.Count - 1 do begin
 WorkStr := sl[slCnt];
 Skip := False;
 if LineIsUUEncoded then Break; // assumption is the rest of the message is UU stuff
 if Length(WorkStr) > MinWordLength then begin
  for Cntr := 1 to length(WorkStr) do begin
   if (not Skip) and (WorkStr[Cntr] in Alphas) then begin
    ThisUn := ThisUn + WorkStr[Cntr];
    Skip := false;
    if (Cntr = length(WorkStr)) or (WorkStr[Cntr + 1] in Delims) then AddWordToBeIndexed;
   end else begin
    if (Cntr = length(WorkStr)) or (WorkStr[Cntr + 1] in Delims) then AddWordToBeIndexed else begin
     Skip := true;
     ThisUn := '';
    end;
   end;
   if Skip then Skip := (Cntr < Length(WorkStr)) and (WorkStr[Cntr + 1] in Delims);
  end;
 end;
end;
if ftiWebStuff and (URLandEmail <> '') then TextToIndex := TextToIndex + URLandEmail;
if TextToIndex <> '' then Delete(TextToIndex, Length(TextToIndex), 1); // get rid of the trailing space
sl.Free;
end;

Roy Lambert [Team Elevate]
Tue, Oct 12 2010 6:57 PMPermanent Link

Kong Aw

Thanks Roy,  it works great and I have modified by DBSYS to extract only the text from XML documents.

I have an ISAPI application and I have an Initialization section as shown below:

initialization
 with Engine do
   begin
     Active := False;
     LargeFileSupport := True;
     EngineSignature := 'mctx70cx';
     TableDataTempExtension := '.edat'; {Dbisam 4}
     TableIndexTempExtension := '.eidx'; {Dbisam 4}
     TableBlobTempExtension := '.eblb'; {Dbisam 4}
     MaxTableDataBufferSize := MaxTableDataBufferSize * 4;
     MaxTableDataBufferCount := MaxTableDataBufferCount * 4;
     MaxTableIndexBufferSize := MaxTableIndexBufferSize * 2;
     MaxTableIndexBufferCount := MaxTableIndexBufferCount * 2;
     OnTextIndexFilter := TextIndexFilter;
     Active := True;
   end;

The compiler says that TextIndexFilter is undefined even though I have defined it in the unit.  Where should I define it.

Regards,
KK Aw

Roy Lambert wrote:

Kong


You have to create a custom function to do it. Here's mine for altering html

The first thing you have to do is let the engine know it exists. I use the datamodule create event. It has to be done before you open any sessions, databases, or table.
Wed, Oct 13 2010 4:40 AMPermanent Link

Roy Lambert

NLH Associates

Team Elevate Team Elevate

Kong


Since its an event it needs to be defined in the class definition of the datamodule, a bit like when you click on any event for a component and its autocreated for you.

Roy Lambert Roy Lambert [Team Elevate]
Wed, Oct 13 2010 5:49 AMPermanent Link

Kong Aw

Roy,

If I use the following code snippets:

type
 TEventHandlers = class // create a dummy class
   procedure TextIndexFilter(Sender: TObject; const TableName, FieldName: string; var TextToIndex: string);
 end;

procedure TEventHandlers.TextIndexFilter(Sender: TObject; const TableName, FieldName: string; var TextToIndex: string);
var
 Dom1: DomDocument60;
begin
....
end;

procedure InitMctWeb;
begin
 with Engine do
   begin
     Active := False;
     LargeFileSupport := True;
     EngineSignature := 'mctx70cx';
     TableDataTempExtension := '.edat'; {Dbisam 4}
     TableIndexTempExtension := '.eidx'; {Dbisam 4}
     TableBlobTempExtension := '.eblb'; {Dbisam 4}
     MaxTableDataBufferSize := MaxTableDataBufferSize * 4;
     MaxTableDataBufferCount := MaxTableDataBufferCount * 4;
     MaxTableIndexBufferSize := MaxTableIndexBufferSize * 2;
     MaxTableIndexBufferCount := MaxTableIndexBufferCount * 2;
     OnTextIndexFilter := TextIndexFilter;
     Active := True;
   end;
end;

Initialization
 InitMctweb;

I get the compiler error "Undeclared identifier 'TextIndexFilter".

If I do not declare TEventHandlers then then compiler error is "Incompatible Types: method pointer and regular procedure".

This is an ISAPI application.

I am lost.  
KK Aw
Wed, Oct 13 2010 6:14 AMPermanent Link

Roy Lambert

NLH Associates

Team Elevate Team Elevate

Kong


I'm sorry, I'm lost as well. I've never created an ISAPI application. The only solution I can think of would be if you have the source to bolt the textfilter code directly into the engine.

I do know that the approach you're trying won't work in a standard VCL app either.

How would you normally hook a components events to a component in an ISAPI app?


Roy Lambert [Team Elevate]
Thu, Oct 14 2010 11:01 AMPermanent Link

Kong Aw

Good news Roy.

I managed to get it working by doing the following:

a) In a new unit I add:

type
 TEventHandlers = class // create a dummy class
   procedure TextIndexFilter(Sender: TObject; const TableName, FieldName: string; var TextToIndex: string);
 end;
 var eh: TEventHandlers; //Global EventHandler variable (see Initialization)

b)  In the main webmodule unit I have:
initialization
 with Engine do
   begin
     Active := False;
     LargeFileSupport := True;
     EngineSignature := 'mctx70cx';
     TableDataTempExtension := '.edat'; {Dbisam 4}
     TableIndexTempExtension := '.eidx'; {Dbisam 4}
     TableBlobTempExtension := '.eblb'; {Dbisam 4}
     MaxTableDataBufferSize := MaxTableDataBufferSize * 4;
     MaxTableDataBufferCount := MaxTableDataBufferCount * 4;
     MaxTableIndexBufferSize := MaxTableIndexBufferSize * 2;
     MaxTableIndexBufferCount := MaxTableIndexBufferCount * 2;
     OnTextIndexFilter := eh.TextIndexFilter;
     Active := True;
   end;

It compiles and it is working so far.

KK Aw
Thu, Oct 14 2010 12:12 PMPermanent Link

Roy Lambert

NLH Associates

Team Elevate Team Elevate

Kong


Congratulations

Roy Lambert
Image