Add
Working with extremely large text files
Memory Mapped Files:
I needed to search a 100 MB text file, but with every method I tried, my machine either crashed or took forever.
I received help from Madshi, who showed me how to do Memory Mapping of text files. He also gave me a function called PosPChar which is from his MadBasic unit set. This function allows you to search through a Memory Mapped file just like a Pos function would do to a regular text file.
You can find more about Madshi and his MadBasic at this website:
http://www.madshi.net/
Because this would be a lot of code duplicated, I am going to post the entire program below. Hopefully you can test this and modify as needed.
First open a new project. Save the project as TextExtract.dpr, save Unit1 as uMain.pas, and name the form fMain.
Drop the following components onto your form.
OpenDialog1
Richedit1 - name as "reFile"
Button1 - name as "btnOpen"
Button2 - name as "btnFind"
CheckBox1 - name as "cbCase"
Edit1 - name as "eNumber"
Edit2 - name as "eString"
Label1 - name as "lblRecord"
Label2 - name as "lblStringToFind"
Now, just copy the code below and then associate the following events:
btnOpen.OnClick;
btnFind.OnClick;
fMain.OnDestroy;
Once this has been done you can load a huge text file and test this program. Note that this program was designed for a project that I had. It searches for the string that is typed into eString, and eNumber tells it which instance to look for.
For example: Let's say I type in "2" into eNumber. I then type "and" into the eString. It will then search through this line of code:
"Hello World and My Place and Delphi is cool and I love it."
and return the following:
"and My Place and"
because my program needed to treat the second instance of "and" as the end of the result, and show everything from the instance of "and" directly before it up to that end.
You can customize the code to search in whatever way you need.
THE CODE
unit uMain;
interface
uses
Windows, Forms, SysUtils, Dialogs, StdCtrls, ComCtrls,
Controls, ExtCtrls, Classes;
type
  { Main form of the text-extract demo: it owns the UI controls and the
    handles of the file that is currently memory mapped. }
  TfMain = class(TForm)
    { Components placed on the form (names must match the form file). }
    OpenDialog1: TOpenDialog;
    reFile: TRichEdit;
    btnOpen: TButton;
    btnFind: TButton;
    cbCase: TCheckBox;
    lblStringToFind: TLabel;
    eString: TEdit;
    lblRecord: TLabel;
    eNumber: TEdit;
    { Event handlers wired up in the Object Inspector. }
    procedure btnOpenClick(Sender: TObject);
    procedure btnFindClick(Sender: TObject);
    procedure FormDestroy(Sender: TObject);
  private
    { Private declarations }
    fh: dword;          // file handle, closed in FormDestroy
    map: dword;         // file-mapping handle, closed in FormDestroy
    bufsize: longword;  // passed to PosPChar as the length of buf
    buf: pointer;       // mapped view, released in FormDestroy
  public
    { Public declarations }
  end;
var
fMain: TfMain;
implementation
{$R *.DFM}
// Lookup table used by PosPChar for case-insensitive comparison: indexing it
// with a character yields the character that is compared instead.
//   * #$41..#$5A ('A'..'Z') map to #$61..#$7A ('a'..'z').
//   * #$8A, #$8C, #$8E map to #$9A, #$9C, #$9E and #$9F maps to #$FF.
//   * #$C0..#$DE map to #$E0..#$FE, except #$D7 which maps to itself.
//   * Every other entry (including #$DF) maps to itself.
// NOTE(review): the upper half looks like a lower-casing map for a Western
// single-byte code page (presumably Windows-1252) - confirm before relying on
// it for other code pages.
var lowCharTable : array [#0..#$FF] of char =
(#$00,#$01,#$02,#$03,#$04,#$05,#$06,#$07,#$08,#$09,#$0A,#$0B,#$0C,#$0D,#$0E,#$0F,
#$10,#$11,#$12,#$13,#$14,#$15,#$16,#$17,#$18,#$19,#$1A,#$1B,#$1C,#$1D,#$1E,#$1F,
#$20,#$21,#$22,#$23,#$24,#$25,#$26,#$27,#$28,#$29,#$2A,#$2B,#$2C,#$2D,#$2E,#$2F,
#$30,#$31,#$32,#$33,#$34,#$35,#$36,#$37,#$38,#$39,#$3A,#$3B,#$3C,#$3D,#$3E,#$3F,
#$40,#$61,#$62,#$63,#$64,#$65,#$66,#$67,#$68,#$69,#$6A,#$6B,#$6C,#$6D,#$6E,#$6F,
#$70,#$71,#$72,#$73,#$74,#$75,#$76,#$77,#$78,#$79,#$7A,#$5B,#$5C,#$5D,#$5E,#$5F,
#$60,#$61,#$62,#$63,#$64,#$65,#$66,#$67,#$68,#$69,#$6A,#$6B,#$6C,#$6D,#$6E,#$6F,
#$70,#$71,#$72,#$73,#$74,#$75,#$76,#$77,#$78,#$79,#$7A,#$7B,#$7C,#$7D,#$7E,#$7F,
#$80,#$81,#$82,#$83,#$84,#$85,#$86,#$87,#$88,#$89,#$9A,#$8B,#$9C,#$8D,#$9E,#$8F,
#$90,#$91,#$92,#$93,#$94,#$95,#$96,#$97,#$98,#$99,#$9A,#$9B,#$9C,#$9D,#$9E,#$FF,
#$A0,#$A1,#$A2,#$A3,#$A4,#$A5,#$A6,#$A7,#$A8,#$A9,#$AA,#$AB,#$AC,#$AD,#$AE,#$AF,
#$B0,#$B1,#$B2,#$B3,#$B4,#$B5,#$B6,#$B7,#$B8,#$B9,#$BA,#$BB,#$BC,#$BD,#$BE,#$BF,
#$E0,#$E1,#$E2,#$E3,#$E4,#$E5,#$E6,#$E7,#$E8,#$E9,#$EA,#$EB,#$EC,#$ED,#$EE,#$EF,
#$F0,#$F1,#$F2,#$F3,#$F4,#$F5,#$F6,#$D7,#$F8,#$F9,#$FA,#$FB,#$FC,#$FD,#$FE,#$DF,
#$E0,#$E1,#$E2,#$E3,#$E4,#$E5,#$E6,#$E7,#$E8,#$E9,#$EA,#$EB,#$EC,#$ED,#$EE,#$EF,
#$F0,#$F1,#$F2,#$F3,#$F4,#$F5,#$F6,#$F7,#$F8,#$F9,#$FA,#$FB,#$FC,#$FD,#$FE,#$FF);
// PosPChar - a Pos() equivalent that works on raw character buffers (for
// example a memory-mapped file) instead of Delphi strings.
//
//   subStr, str  : text to look for / buffer to search in.
//   subStrLen    : length of subStr; 0 -> the length is determined internally.
//   strLen       : length of str;    0 -> the length is determined internally.
//                  Always pass it for buffers that are not #0 terminated.
//   ignoreCase   : compare through lowCharTable instead of byte-for-byte.
//   fromPos,toPos: 0-based offsets of the first and last start position to
//                  try. If fromPos > toPos the buffer is scanned backwards,
//                  from fromPos down to toPos.
//
// Returns the 0-based offset of the first match in scan direction, or -1 if
// nothing was found or one of the inputs is nil / starts with #0.
//
// NOTE(review): the published listing had lost every "<...>" span to HTML
// stripping. The backward branch was restored from its intact fragments and
// the forward branch was rebuilt as its mirror image - compare against the
// original madBasic source if exact parity matters.
function PosPChar(subStr     : pchar;
                  str        : pchar;
                  subStrLen  : cardinal = 0;   // 0 -> StrLen is called internally
                  strLen     : cardinal = 0;
                  ignoreCase : boolean  = false;
                  fromPos    : cardinal = 0;
                  toPos      : cardinal = high(cardinal)) : integer;

  // Length of a #0 terminated string. 32-bit inline assembler: the pointer is
  // taken from EAX, EDI is saved in EDX and restored before returning.
  function GetPCharLen(const pc: pchar) : cardinal; assembler;
  asm
    MOV     EDX,EDI
    MOV     EDI,EAX
    MOV     ECX,0FFFFFFFFH
    XOR     AL,AL
    REPNE   SCASB
    MOV     EAX,0FFFFFFFEH
    SUB     EAX,ECX
    MOV     EDI,EDX
  end;

var pc1, pc2, pc3, pc4, pc5, pc6 : pchar;
    c1                           : cardinal;
    ch1                          : char;
begin
  result := -1;
  if (subStr <> nil) and (subStr^ <> #0) and (str <> nil) and (str^ <> #0) then begin
    if subStrLen = 0 then subStrLen := GetPCharLen(subStr);
    if strLen    = 0 then strLen    := GetPCharLen(   str);
    dec(subStrLen);                    // now: index of the last char of subStr
    if strLen >= subStrLen then begin
      c1 := strLen - subStrLen;        // number of valid start offsets (0..c1-1)
      if ignoreCase then ch1 := lowCharTable[subStr^]
      else               ch1 :=              subStr^;
      if fromPos > toPos then begin
        // ---- backward scan: fromPos down to toPos ----
        if toPos < c1 then begin
          // Clamp to the LAST valid start offset. The published fragments
          // clamped to c1, which lets the compare loop read one byte past
          // the end of the buffer.
          if fromPos >= c1 then fromPos := c1 - 1;
          pc1 := str + fromPos;        // current candidate
          pc2 := str + toPos;          // last candidate to try
          pc3 := subStr + 1;           // second char of subStr
          pc4 := subStr + subStrLen;   // last char of subStr
          pc6 := pc3;                  // used to rewind pc3 after a mismatch
          if ignoreCase then begin
            while pc1 >= pc2 do
              if lowCharTable[pc1^] = ch1 then begin
                inc(pc1);
                pc5 := pc1;            // candidate + 1
                while (pc3 <= pc4) and (lowCharTable[pc1^] = lowCharTable[pc3^]) do begin
                  inc(pc1);
                  inc(pc3);
                end;
                if pc3 > pc4 then begin
                  result := pc5 - pchar(str) - 1;
                  break;
                end;
                pc3 := pc6;
                pc1 := pc5 - 2;        // step to the candidate before this one
              end else dec(pc1);
          end else
            while pc1 >= pc2 do
              if pc1^ = ch1 then begin
                inc(pc1);
                pc5 := pc1;
                while (pc3 <= pc4) and (pc1^ = pc3^) do begin
                  inc(pc1);
                  inc(pc3);
                end;
                if pc3 > pc4 then begin
                  result := pc5 - pchar(str) - 1;
                  break;
                end;
                pc3 := pc6;
                pc1 := pc5 - 2;
              end else dec(pc1);
        end;
      end else
        // ---- forward scan: fromPos up to toPos ----
        if fromPos < c1 then begin
          if toPos >= c1 then toPos := c1 - 1;   // see clamp note above
          pc1 := str + fromPos;
          pc2 := str + toPos;
          pc3 := subStr + 1;
          pc4 := subStr + subStrLen;
          pc6 := pc3;
          if ignoreCase then begin
            while pc1 <= pc2 do
              if lowCharTable[pc1^] = ch1 then begin
                inc(pc1);
                pc5 := pc1;
                while (pc3 <= pc4) and (lowCharTable[pc1^] = lowCharTable[pc3^]) do begin
                  inc(pc1);
                  inc(pc3);
                end;
                if pc3 > pc4 then begin
                  result := pc5 - pchar(str) - 1;
                  break;
                end;
                pc3 := pc6;
                pc1 := pc5;            // continue with the next candidate
              end else inc(pc1);
          end else
            while pc1 <= pc2 do
              if pc1^ = ch1 then begin
                inc(pc1);
                pc5 := pc1;
                while (pc3 <= pc4) and (pc1^ = pc3^) do begin
                  inc(pc1);
                  inc(pc3);
                end;
                if pc3 > pc4 then begin
                  result := pc5 - pchar(str) - 1;
                  break;
                end;
                pc3 := pc6;
                pc1 := pc5;
              end else inc(pc1);
        end;
    end;
  end;
end;

// Lets the user pick a file and maps it read-only into memory. On success
// buf points at the file contents and bufsize holds the file size.
//
// NOTE(review): this handler was missing entirely from the published listing.
// It was rewritten from the article's description and from the fields that
// FormDestroy releases (buf, map, fh) - adjust to taste.
procedure TfMain.btnOpenClick(Sender: TObject);

  // Releases whatever is currently mapped and resets the fields.
  procedure ReleaseCurrentFile;
  begin
    if buf <> nil then UnmapViewOfFile(buf);
    if map <> 0 then CloseHandle(map);
    if (fh <> 0) and (fh <> INVALID_HANDLE_VALUE) then CloseHandle(fh);
    buf     := nil;
    map     := 0;
    fh      := 0;
    bufsize := 0;
  end;

  // Reports the last Win32 error and leaves the form without a mapped file.
  procedure Fail(const what: string);
  var err: dword;
  begin
    err := GetLastError;               // read first, before anything resets it
    ReleaseCurrentFile;
    ShowMessage(what + ': ' + SysErrorMessage(err));
  end;

begin
  if not OpenDialog1.Execute then Exit;
  ReleaseCurrentFile;                  // a previously opened file must not leak

  fh := CreateFile(PChar(OpenDialog1.FileName), GENERIC_READ, FILE_SHARE_READ,
                   nil, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0);
  if fh = INVALID_HANDLE_VALUE then begin
    Fail('Cannot open file');
    Exit;
  end;
  bufsize := GetFileSize(fh, nil);

  // Size 0/0 maps the whole file; this fails for an empty file.
  map := CreateFileMapping(fh, nil, PAGE_READONLY, 0, 0, nil);
  if map = 0 then begin
    Fail('Cannot create file mapping');
    Exit;
  end;

  buf := MapViewOfFile(map, FILE_MAP_READ, 0, 0, 0);
  if buf = nil then
    Fail('Cannot map file into memory');
end;

// Looks for occurrence number (N-1) of eString.Text, N being the number in
// eNumber, then for the next occurrence after it, and shows everything from
// the start of the former to the end of the latter in reFile.
// Example: N = 2 and "and" on
//   "Hello World and My Place and Delphi is cool and I love it."
// shows "and My Place and".
//
// NOTE(review): the head of this handler (declarations and initial values)
// was lost in the published listing; i := 0 / C := 0 are inferred from the
// example above.
procedure TfMain.btnFindClick(Sender: TObject);
var
  needle : string;
  S      : string;
  wanted : integer;   // N, the instance number typed into eNumber
  C      : integer;   // occurrences found so far
  i, i2  : integer;   // i: offset just behind the start of the last match
begin
  needle := eString.Text;
  if (buf = nil) or (needle = '') then Exit;   // nothing mapped / nothing to find

  // Parse once, and do not raise on malformed input.
  wanted := StrToIntDef(eNumber.Text, 0);

  i := 0;
  C := 0;
  while True do
  begin
    i := PosPChar(PChar(needle), PChar(buf), Length(needle), bufSize, cbCase.Checked, i) + 1;
    if i = 0 then
    begin
      // No further match: stop here instead of searching again from -1.
      reFile.Text := IntToStr(C)+' is the last instance found.';
      break;
    end;
    inc(C);
    if C = (wanted-1) then
    begin
      // The starting instance is found, now locate the closing one.
      i2 := PosPChar(PChar(needle), PChar(buf), Length(needle), bufSize, cbCase.Checked, i);
      if i2 > -1 then
      begin
        reFile.Clear;
        // i-1 is the start of the first match; the closing match ends at
        // i2 + Length(needle).
        SetString(S, PChar(buf) + i - 1, i2-i+Length(needle)+1);
        reFile.Text := S;
      end else
        reFile.Lines.Text := IntToStr(C)+' is the last instance found.';
      break;
    end else if C > (wanted-1) then
    begin
      reFile.Text := 'Not found in document.';
      break;
    end;
  end;
end;
// Releases the memory-mapped file when the form goes away. Each resource is
// released only if it was actually acquired, so closing the form without ever
// opening a file (or after a failed open) no longer hands nil / invalid
// handles to the Win32 API.
procedure TfMain.FormDestroy(Sender: TObject);
begin
  // Release in reverse order of acquisition: view, mapping, file.
  if buf <> nil then
  begin
    UnmapViewOfFile(buf);
    buf := nil;
  end;
  if map <> 0 then
  begin
    CloseHandle(map);
    map := 0;
  end;
  // CreateFile reports failure with INVALID_HANDLE_VALUE, not 0.
  if (fh <> 0) and (fh <> INVALID_HANDLE_VALUE) then
    CloseHandle(fh);
  fh := 0;
end;
end.
END OF CODE
A few things I learned as I played around with the code: if you do not supply all of the key parameters to the PosPChar function (in particular the two lengths), it can slow down by 1000% on large files, because it then has to work out those values again on every call.
I was also told by Madshi that his Madbasic units let you search through the files backwards if needed.
Well, good luck. I realize that I haven't given a great deal of explanation for the code, but hopefully you can figure out how to modify it to suit your needs.
|
Latest News
Submit News Form
Past News
Latest Forum Entries
|