Add Working with extremely large text files


Memory Mapped Files:

I needed to search a 100 MB text file, but every method I tried either crashed my machine or took forever.

I received help from Madshi, who showed me how to memory-map text files. He also gave me a function called PosPChar, which comes from his MadBasic unit set. This function lets you search through a memory-mapped file the same way the Pos function searches an ordinary string.

You can find more about Madshi and his MadBasic at this website:

http://www.madshi.net/

Because this would be a lot of code duplicated, I am going to post the entire program below. Hopefully you can test this and modify as needed.

First, open a new project. Save the project as TextExtract.dpr, save Unit1 as uMain.pas, and name the form fMain.

Drop the following components onto your form.

OpenDialog1
Richedit1 - name as "reFile"
Button1 - name as "btnOpen"
Button2 - name as "btnFind"
CheckBox1 - name as "cbCase"
Edit1 - name as "eNumber"
Edit2 - name as "eString"
Label1 - name as "lblRecord"
Label2 - name as "lblStringToFind"

Now, just copy the code below and then associate the following events:

btnOpen.OnClick;
btnFind.OnClick;
fMain.OnDestroy;

Once this has been done, you can load a huge text file and test the program. Note that this program was designed for a specific project of mine: it searches for the string typed into eString, and eNumber tells it which instance of that string to find.

For example, let's say I type "2" into eNumber and "and" into eString. It will then search through this line of text:

"Hello World and My Place and Delphi is cool and I love it."

and return the following:

"and My Place and"

because my program needed to treat the second instance of "and" as the end of the match, and show everything from the "and" directly before it up to and including that second instance.

You can customize the code to search the file in whatever way you need.

THE CODE

  1. unit uMain;
  2.  
  3. interface
  4.  
  5. uses
  6.   Windows, Forms, SysUtils, Dialogs, StdCtrls, ComCtrls,
  7.   Controls, ExtCtrls, Classes;
  8.  
  9. type
  10.   TfMain = class(TForm) // main form: holds the UI components and the state used to search buf with PosPChar
  11.     OpenDialog1: TOpenDialog;
  12.     reFile: TRichEdit; // receives the extracted text or a status message
  13.     btnOpen: TButton;
  14.     cbCase: TCheckBox; // its Checked state is passed to PosPChar as ignoreCase
  15.     btnFind: TButton;
  16.     lblStringToFind: TLabel;
  17.     eString: TEdit; // the string to search for
  18.     lblRecord: TLabel;
  19.     eNumber: TEdit; // which instance of the string to find (parsed with StrToInt)
  20.     procedure btnOpenClick(Sender: TObject);
  21.     procedure btnFindClick(Sender: TObject);
  22.     procedure FormDestroy(Sender: TObject);
  23.   private
  24.     fh, map: dword; // handles released with CloseHandle in FormDestroy (presumably the file and its mapping object - confirm in btnOpenClick)
  25.     bufsize: longword; // passed to PosPChar as strLen, i.e. the length of the data at buf
  26.     buf : pointer; // start of the data searched by PosPChar; released with UnmapViewOfFile in FormDestroy
  27.     { Private declarations }
  28.   public
  29.     { Public declarations }
  30.   end;
  31.  
  32. var
  33.   fMain: TfMain;
  34.  
  35. implementation
  36.  
  37. {$R *.DFM}
  38.  
  39. var lowCharTable : array [#0..#$FF] of char = // per-character lookup used by PosPChar when ignoreCase is true; NOTE(review): mapping looks like Windows-1252 lower-casing - confirm
  40.   (#$00,#$01,#$02,#$03,#$04,#$05,#$06,#$07,#$08,#$09,#$0A,#$0B,#$0C,#$0D,#$0E,#$0F,
  41.    #$10,#$11,#$12,#$13,#$14,#$15,#$16,#$17,#$18,#$19,#$1A,#$1B,#$1C,#$1D,#$1E,#$1F,
  42.    #$20,#$21,#$22,#$23,#$24,#$25,#$26,#$27,#$28,#$29,#$2A,#$2B,#$2C,#$2D,#$2E,#$2F,
  43.    #$30,#$31,#$32,#$33,#$34,#$35,#$36,#$37,#$38,#$39,#$3A,#$3B,#$3C,#$3D,#$3E,#$3F,
  44.    #$40,#$61,#$62,#$63,#$64,#$65,#$66,#$67,#$68,#$69,#$6A,#$6B,#$6C,#$6D,#$6E,#$6F, // $41-$4F ('A'..'O') map to $61-$6F ('a'..'o')
  45.    #$70,#$71,#$72,#$73,#$74,#$75,#$76,#$77,#$78,#$79,#$7A,#$5B,#$5C,#$5D,#$5E,#$5F, // $50-$5A ('P'..'Z') map to $70-$7A; $5B-$5F unchanged
  46.    #$60,#$61,#$62,#$63,#$64,#$65,#$66,#$67,#$68,#$69,#$6A,#$6B,#$6C,#$6D,#$6E,#$6F,
  47.    #$70,#$71,#$72,#$73,#$74,#$75,#$76,#$77,#$78,#$79,#$7A,#$7B,#$7C,#$7D,#$7E,#$7F,
  48.    #$80,#$81,#$82,#$83,#$84,#$85,#$86,#$87,#$88,#$89,#$9A,#$8B,#$9C,#$8D,#$9E,#$8F, // $8A, $8C, $8E map to $9A, $9C, $9E
  49.    #$90,#$91,#$92,#$93,#$94,#$95,#$96,#$97,#$98,#$99,#$9A,#$9B,#$9C,#$9D,#$9E,#$FF, // $9F maps to $FF
  50.    #$A0,#$A1,#$A2,#$A3,#$A4,#$A5,#$A6,#$A7,#$A8,#$A9,#$AA,#$AB,#$AC,#$AD,#$AE,#$AF,
  51.    #$B0,#$B1,#$B2,#$B3,#$B4,#$B5,#$B6,#$B7,#$B8,#$B9,#$BA,#$BB,#$BC,#$BD,#$BE,#$BF,
  52.    #$E0,#$E1,#$E2,#$E3,#$E4,#$E5,#$E6,#$E7,#$E8,#$E9,#$EA,#$EB,#$EC,#$ED,#$EE,#$EF, // $C0-$CF map to $E0-$EF
  53.    #$F0,#$F1,#$F2,#$F3,#$F4,#$F5,#$F6,#$D7,#$F8,#$F9,#$FA,#$FB,#$FC,#$FD,#$FE,#$DF, // $D0-$DE map to $F0-$FE except $D7; $D7 and $DF unchanged
  54.    #$E0,#$E1,#$E2,#$E3,#$E4,#$E5,#$E6,#$E7,#$E8,#$E9,#$EA,#$EB,#$EC,#$ED,#$EE,#$EF,
  55.    #$F0,#$F1,#$F2,#$F3,#$F4,#$F5,#$F6,#$F7,#$F8,#$F9,#$FA,#$FB,#$FC,#$FD,#$FE,#$FF);
  56.  
  57. function PosPChar(subStr       : pchar;
  58.                   str          : pchar;
  59.                   subStrLen    : cardinal = 0;   // 0 -> StrLen is called internally
  60.                   strLen       : cardinal = 0;
  61.                   ignoreCase   : boolean  = false;
  62.                   fromPos      : cardinal = 0;
  63.                   toPos        : cardinal = high(cardinal)) : integer;
  64.  
  65.   function GetPCharLen(const pc: pchar) : cardinal; assembler;
  66.   asm
  67.     MOV     EDX,EDI
  68.     MOV     EDI,EAX
  69.     MOV     ECX,0FFFFFFFFH
  70.     XOR     AL,AL
  71.     REPNE   SCASB
  72.     MOV     EAX,0FFFFFFFEH
  73.     SUB     EAX,ECX
  74.     MOV     EDI,EDX
  75.   end;
  76.  
  77. var pc1, pc2, pc3, pc4, pc5, pc6 : pchar;
  78.     c1                           : cardinal;
  79.     ch1                          : char;
  80. begin
  81.   result := -1;
  82.   if (subStr  nil) and (subStr^  #0) and (str  nil) and (str^  #0) then begin
  83.     if subStrLen = 0 then subStrLen := GetPCharLen(subStr);
  84.     if    strLen = 0 then    strLen := GetPCharLen(   str);
  85.     dec(subStrLen);
  86.     if strLen >= subStrLen then begin
  87.       c1 := strLen - subStrLen;
  88.       if ignoreCase then ch1 := lowCharTable[subStr^]
  89.       else               ch1 :=              subStr^;
  90.       if fromPos > toPos then begin
  91.         if toPos  c1 then fromPos := c1;
  92.           pc1 := str + fromPos;
  93.           pc2 := str +   toPos;
  94.           pc3 := subStr + 1;
  95.           pc4 := subStr + subStrLen;
  96.           pc6 := pc3;
  97.           if ignoreCase then begin
  98.             while pc1 >= pc2 do
  99.               if lowCharTable[pc1^] = ch1 then begin
  100.                 inc(pc1);
  101.                 pc5 := pc1;
  102.                 while (pc3  pc4 then begin
  103.                   result := pc5 - pchar(str) - 1;
  104.                   break;
  105.                 end;
  106.                 pc3 := pc6;
  107.                 pc1 := pc5 - 2;
  108.               end else dec(pc1);
  109.           end else
  110.             while pc1 >= pc2 do
  111.               if pc1^ = ch1 then begin
  112.                 inc(pc1);
  113.                 pc5 := pc1;
  114.                 while (pc3  pc4 then begin
  115.                   result := pc5 - pchar(str) - 1;
  116.                   break;
  117.                 end;
  118.                 pc3 := pc6;
  119.                 pc1 := pc5 - 2;
  120.               end else dec(pc1);
  121.         end;
  122.       end else
  123.         if fromPos  c1 then toPos := c1;
  124.           pc1 := str + fromPos;
  125.           pc2 := str +   toPos;
  126.           pc3 := subStr + 1;
  127.           pc4 := subStr + subStrLen;
  128.           pc6 := pc3;
  129.           if ignoreCase then begin
  130.             while pc1  -1 do
  131.     begin
  132.         i := PosPChar(PChar(eString.Text), PChar(buf), Length(eString.Text), bufSize, cbCase.Checked, i)+1;
  133.         if i = 0 then
  134.         begin
  135.             i := -1;
  136.             reFile.Text := IntToStr(C)+' is the last instance found.';
  137.         end else
  138.             inc(C);
  139.         if c = (StrToInt(eNumber.Text)-1) then
  140.         begin
  141.             i2 := PosPChar(PChar(eString.Text), PChar(buf), Length(eString.Text), bufSize, cbCase.Checked, i);
  142.             if i2 > -1 then
  143.             begin
  144.                 reFile.Clear;
  145.                 S := '';
  146.                 SetString(S, pchar(integer(buf) + i-1), i2-i+Length(eString.Text)+1);
  147.                 reFile.Text := S;
  148.                 break;
  149.             end else
  150.             begin
  151.                 reFile.Lines.Text := IntToStr(C)+' is the last instance found.';
  152.                 break;
  153.             end;
  154.         end else if c > (StrToInt(eNumber.Text)-1) then
  155.         begin
  156.             reFile.Text := 'Not found in document.';
  157.             break;
  158.         end;
  159.     end;
  160.     end;
  161. end;
  162.  
  163. procedure TfMain.FormDestroy(Sender: TObject); // releases the view, then the mapping handle, then the file handle; skips any that were never acquired
  164. begin
  165.     if buf <> nil then UnmapViewOfFile(buf); // buf stays nil when no view was ever mapped
  166.     if map <> 0 then CloseHandle(map); // 0 means no mapping handle was stored
  167.     if (fh <> 0) and (fh <> INVALID_HANDLE_VALUE) then CloseHandle(fh); // skip both an unset field and a failed-open sentinel
  168. end;
  169.  
  170. end.


END OF CODE

A few things I learned as I played around with the code: if you do not supply the length parameters (subStrLen and strLen) to the PosPChar function, it can run 1000% slower on large files, because it then has to work out those lengths itself on every call.

I was also told by Madshi that his Madbasic units let you search through the files backwards if needed.

Well, good luck. I realize that I haven't given a great deal of explanation for the code, but hopefully you can figure out how to modify it to suit your needs.
Related Discussions
  • USING DLLS (2001-01-03 23:47:50)
    Hi again Goober ... :o) Let's take NETAPI32.DLL as an example. This DLL is created by Microsoft and the explanation of its use is described in...
  • HELP WITH CHDIR !!!! TURBO PASCAL VS DELPHI (2001-01-03 06:48:49)
    This is because the ChDir Procedure, the SetCurrentDir and SetCurrentDirectory Functions are changing the directory in the active process. Try...
  • RUNNING A DATABASE APPLICATION ON ANOTHER COMPUTER (2001-01-07 05:36:33)
    In the BDE on the other PC (the one without the database), you must add an alias to point to the location of the database in the format of:...
  • ADD TO RESOURCE ON RUNTIME (2001-01-04 09:00:49)
    I think what you are asking is to change the .EXE file at run-time. Yes, it is possible, but nobody I found will share that information. If you...
  • HELP WITH EXCEPTIONS ..... I'M GETTING CRAZY! (2001-01-06 19:59:49)
    How to make it run as if you were running it outside of Delphi. Tools | Debugger Options --> Language Exceptions. Their is a checkbox titled...
  • TELEPHONE BASED APPLICATION (2001-01-08 10:29:39)
    I did something like this. I wrote a fully operative phone- based banking applciation in two days using VisualVoice and Dialogic cards. However,...
  • QUERY PARAMETER QUESTION...? (2001-01-04 22:44:03)
    there are two parts first is writing the script with variable and another is using bind variable eg- for first you can say query1.sql.text...
  • EXES CONTAINING MORE THAN ONE ICON RESOURCE ... (2001-01-05 09:25:58)
    Just draw your icons with the Borland Image Editor (Delivered with Delphi). It allows to save different icons in one ico file as well.
  • HOW TO LOAD/SAVE FROM/TO INI FILES WITHOUT COMPONENTS? (2001-01-05 11:55:47)
    Thx...:)
  • .BMP ISN'T A VALID PICTURE FILE FORMAT!? UHUM!? (2001-01-05 14:59:28)
    I think you have to Specify what line the text is in, like this. Image1.Picture.LoadFromFile(Memo1.Lines.Strings); If the text in the...
Latest News
Submit News Form Past News
Latest Forum Entries