ORIGIN 'betaenv';
INCLUDE 'private/pcrelib';
BODY 'private/pcrebody';
(*
* COPYRIGHT
* Copyright (C) Mjolner Informatics, 2000
* All rights reserved.
* Written by Erik Corry
*)
--- lib: attributes ---
(*
* Perl compatible regular expressions, based on Philip Hazel's PCRE
* library. See his documentation and perl documentation for details.
* To activate the /i /x /m or /s options you can use the inline (?x)
* notation, either at the top level of the regular expression or in a
* subexpression. You can disable the options again with (?-x). You can
* also use the options described below.
*
* See also pcreDemo.bet in the basiclib/demo/pcre directory for some uses
* for this stuff.
*)
(* HOW TO DO SOME TYPICAL PERL THINGS
*
* Here are a few things that are very easy to do in perl with the
* equivalent using BETA's perl-compatible regular expression support.
* As you can see, the BETA version is often a little longer - this is
* the penalty you pay for having a general purpose language. You can
* save some space at the expense of readability and perhaps efficiency
* by initialising the Pcre object inline.
*
* Assume
* pre: @Pcre;
* ok: @boolean;
*
* Desc: Test whether a string matches a pattern
* Perl: if $sample =~ /trigger/ ...
* BETA: 'trigger' -> pre;
* (if sample[] -> pre.match then ... if)
* Alternative:
* (if sample[] -> ('trigger' -> Pcre).match then ... if)
*
* Desc: Replace a text in a string with another text
* Perl: $sample =~ s/gun/pistol/;
* BETA: 'gun' -> pre;
* (sample[], 'pistol') -> pre.replace -> (ok, sample[]);
*
* For /g use replaceAll instead of replace
* For /e use rep, see HTMLise in pcreDemo in ~beta/basiclib/demo/pcre
*
* Desc: Test for case insensitive match
* Perl: if $sample =~ /trigger/i ...
* BETA: '(?i)trigger' -> pre;
* (if sample[] -> pre.match then ... if);
* Alternative:
* 'trigger' -> pre (# options:: (# do CASELESS #) #);
* (if sample[] -> pre.match then ... if);
* Likewise for /x
*
* Desc: Split an input line three ways into fields using : as separator
* Perl: ($wordone, $wordtwo, $rest) = split(/:/, $sample, 3);
* BETA: sample[] -> (':' -> Pcre).matchAll
* (#
* post:: (# do sp1 -> wordone[];
* sp2 -> wordtwo[];
* rest3 -> rest[];
* #)
* #)
* Alternative:
* sample[] -> (':' -> Pcre).matchAll
* (# post:: (# do ways3 -> (wordone[], wordtwo[], rest[]) #) #);
*)
Pcre:
(#
(* Virtual exception pattern for problems with compiling a regular
* expression. The text entered is kept in errortext, where the INNER
* part of a further binding can read it. Where the exception is raised
* is not visible in this interface (presumably from init - confirm
* against the body fragment).
*)
compilation_error:< Exception
(#
errortext: ^Text;
enter errortext[]
do INNER;
#);
(* Options: See pcre.h and doc *)
(* Every option below is a distinct power of two, so several options can
* be combined into one integer with %Bor.
*)
pcre_CASELESS: (# exit 1 #);
pcre_MULTILINE: (# exit 2 #);
pcre_DOTALL: (# exit 4 #);
pcre_EXTENDED: (# exit 8 #);
pcre_ANCHORED: (# exit 16 #);
pcre_DOLLAR_ENDONLY: (# exit 32 #);
pcre_EXTRA: (# exit 64 #);
pcre_NOTBOL: (# exit 128 #);
pcre_NOTEOL: (# exit 256 #);
pcre_UNGREEDY: (# exit 512 #);
pcre_NOTEMPTY: (# exit 1024 #);
(* Mask with the sixteen low bits set. It covers every option above and
* none of the BETA-only options below, which all start at bit 16.
*)
pcre_NONBETAOPTIONS: (# exit 65535 #);
(* Only in BETA library version *)
(* Use non-localised English char classes *)
(* You have to set this when compiling the regexp, not when matching *)
pcre_C_LOCALE: (# exit 65536 #);
(* Study the regular expression after compiling it *)
(* You have to set this when compiling the regexp, not when matching *)
pcre_DO_STUDY: (# exit 131072 #);
(* Give none instead of zero length strings for cases where there is no
* match. This is more correct, but you have to program more carefully
* to avoid runtime errors.
*)
pcre_RETURN_NONE: (# exit 262144 #);
(* Mask of the options that may be given at match time. The options
* virtual of match ANDs the global options with this mask to get its
* default value.
*)
pcre_MATCHOPTIONS: (# exit pcre_NOTBOL %Bor
pcre_NOTEOL %Bor
pcre_NOTEMPTY %Bor
pcre_RETURN_NONE #);
(* For internal use *)
(* Request codes; presumably these mirror the PCRE_INFO_* values of
* pcre.h - confirm against the header of the PCRE version in use.
*)
pcre_INFO_OPTIONS: (# exit 0 #);
pcre_INFO_SIZE: (# exit 1 #);
pcre_INFO_CAPTURECOUNT: (# exit 2 #);
pcre_INFO_BACKREFMAX: (# exit 3 #);
pcre_INFO_FIRSTCHAR: (# exit 4 #);
pcre_INFO_FIRSTTABLE: (# exit 5 #);
pcre_INFO_LASTLITERAL: (# exit 6 #);
(* Negative error codes; presumably these mirror the PCRE_ERROR_* values
* of pcre.h - confirm against the header of the PCRE version in use.
*)
pcre_ERROR_NOMATCH: (# exit -1 #);
pcre_ERROR_NULL: (# exit -2 #);
pcre_ERROR_BADOPTION: (# exit -3 #);
pcre_ERROR_BADMAGIC: (# exit -4 #);
pcre_ERROR_UNKNOWN_NODE:(# exit -5 #);
pcre_ERROR_NOMEMORY: (# exit -6 #);
pcre_ERROR_NOSUBSTRING: (# exit -7 #);
(* Private internal state. Its structure is elided in this interface
* and is not meant to be used by clients.
*)
private: @...;
(* Read-only for users of pcre. Tells you how many subpatterns your
* pattern has. Only useful if you are reading regular expressions from
* a config file or from the user, since otherwise you should know this
* figure already :-]
* Where the value is assigned is not visible here (presumably in init
* - confirm against the body fragment).
*)
subPatterns: @Integer;
(* Virtual pattern that computes the option bit mask of this Pcre
 * object. Further bind it and invoke the setters and clearers below
 * from the INNER part to choose the options used when compiling the
 * regular expression and the default options used when matching.
 * value starts out as 0 and is exited after INNER has run.
 *)
options:<
  (# value: @Integer;

     (* Setters: OR the bit of the named option into value.
      * The bits are the pcre_* constants declared above.
      *)
     CASELESS:       (# do value %Bor pcre_CASELESS       -> value #);
     MULTILINE:      (# do value %Bor pcre_MULTILINE      -> value #);
     DOTALL:         (# do value %Bor pcre_DOTALL         -> value #);
     EXTENDED:       (# do value %Bor pcre_EXTENDED       -> value #);
     ANCHORED:       (# do value %Bor pcre_ANCHORED       -> value #);
     DOLLAR_ENDONLY: (# do value %Bor pcre_DOLLAR_ENDONLY -> value #);
     EXTRA:          (# do value %Bor pcre_EXTRA          -> value #);
     NOTBOL:         (# do value %Bor pcre_NOTBOL         -> value #);
     NOTEOL:         (# do value %Bor pcre_NOTEOL         -> value #);
     UNGREEDY:       (# do value %Bor pcre_UNGREEDY       -> value #);
     NOTEMPTY:       (# do value %Bor pcre_NOTEMPTY       -> value #);
     C_LOCALE:       (# do value %Bor pcre_C_LOCALE       -> value #);
     DO_STUDY:       (# do value %Bor pcre_DO_STUDY       -> value #);
     RETURN_NONE:    (# do value %Bor pcre_RETURN_NONE    -> value #);

     (* Clearers: AND value with the complement of the named bit *)
     clearCASELESS:
       (# do value %Band (%Bnot pcre_CASELESS) -> value #);
     clearMULTILINE:
       (# do value %Band (%Bnot pcre_MULTILINE) -> value #);
     clearDOTALL:
       (# do value %Band (%Bnot pcre_DOTALL) -> value #);
     clearEXTENDED:
       (# do value %Band (%Bnot pcre_EXTENDED) -> value #);
     clearANCHORED:
       (# do value %Band (%Bnot pcre_ANCHORED) -> value #);
     clearDOLLAR_ENDONLY:
       (# do value %Band (%Bnot pcre_DOLLAR_ENDONLY) -> value #);
     clearEXTRA:
       (# do value %Band (%Bnot pcre_EXTRA) -> value #);
     clearNOTBOL:
       (# do value %Band (%Bnot pcre_NOTBOL) -> value #);
     clearNOTEOL:
       (# do value %Band (%Bnot pcre_NOTEOL) -> value #);
     clearUNGREEDY:
       (# do value %Band (%Bnot pcre_UNGREEDY) -> value #);
     clearNOTEMPTY:
       (# do value %Band (%Bnot pcre_NOTEMPTY) -> value #);
     clearC_LOCALE:
       (# do value %Band (%Bnot pcre_C_LOCALE) -> value #);
     clearDO_STUDY:
       (# do value %Band (%Bnot pcre_DO_STUDY) -> value #);
     clearRETURN_NONE:
       (# do value %Band (%Bnot pcre_RETURN_NONE) -> value #);
  do 0 -> value;
     INNER;
  exit value
  #);
(* Takes a reference to the text of the regular expression in exp and
* compiles it. The body is elided in this interface. The locals below
* are working storage for that body; their exact use is not visible
* here (presumably opt holds the evaluated options, and error,
* errtext and erroffset receive the error report of the compiler -
* confirm against the body fragment).
*)
init:
(#
error: ^CString;
exp: ^Text;
opt: @Integer;
errtext: @Integer;
erroffset: @Integer;
enter exp[]
...
#);
match:
(#
result: @Integer;
subMatchCounter: @Integer;
(* Advance subMatchCounter by one and exit the new value. This is the
 * index used by nextSubMatchPos and nextSubMatchText, so successive
 * calls walk through the submatches one at a time.
 * Note that "=" is the equality test in BETA; the counter has to be
 * updated with an assignment evaluation ("->"), otherwise it never
 * changes and the same index is exited on every call.
 *)
nextSubMatchIndex:
  (#
  do subMatchCounter + 1 -> subMatchCounter;
  exit subMatchCounter
  #);
(* Get (as an integer pair) the position of the text that matched
* the regular expression in the original text.
* Exits (start, end); the body that fills them in is elided here.
*)
matchPos:
(#
start: @Integer;
end: @Integer;
...
exit (start, end)
#);
(* Exit a text reference to the part of the subject that matched the
 * regular expression. The text is obtained by handing the position
 * pair from matchPos to subject.sub.
 *)
matchText:
  (# result: ^Text;
     lo, hi: @Integer;
  do matchPos -> (lo, hi);
     (lo, hi) -> subject.sub -> result[];
  exit result[]
  #);
(* Get (as a text reference) the text before the text that matched
* the regular expression.
* Exits result[]; the body that computes it is elided here.
*)
preMatchText:
(#
result: ^Text;
...
exit result[]
#);
(* Get (as a text reference) the text after the text that matched
* the regular expression.
* Exits result[]; the body that computes it is elided here.
*)
postMatchText:
(#
result: ^Text;
...
exit result[]
#);
(* Get (as an integer pair) the position of the nth submatch in the
* original text. You get (0,0) if the nth subpattern didn't match.
* (It is possible that the nth subpattern didn't match, even if
* the pattern as a whole matched. This is different from the
* subpattern matching an empty string.)
* Enter the submatch number in index; exits (start, end). The body
* is elided here.
*)
subMatchPos:
(#
index: @Integer;
start: @Integer;
end: @Integer;
enter index
...
exit (start, end)
#);
(* Get (as an integer pair) the position of the next submatch in the
* original text. You get (0,0) if the next subpattern didn't match.
* (It is possible that the nth subpattern didn't match, even if
* the pattern as a whole matched. This is different from the
* subpattern matching an empty string.)
* "Next" is the index exited by nextSubMatchIndex, which is passed
* on to subMatchPos.
*)
nextSubMatchPos:
(#
exit nextSubMatchIndex -> subMatchPos
#);
(* Get (as a text reference) the text of the nth submatch in the
* original text. You get NONE if the nth subpattern didn't match and
* you set the RETURN_NONE option.
* (It is possible that the nth subpattern didn't match, even if
* the pattern as a whole matched. This is different from the
* subpattern matching an empty string.)
* Enter the submatch number in index; exits result[]. The body is
* elided here.
*)
subMatchText:
(#
index: @Integer;
start: @Integer;
end: @Integer;
result: ^Text;
enter index
...
exit result[]
#);
(* Get (as a text reference) the text of the next submatch in the
* original text. You get NONE if the next subpattern didn't match
* and you set the RETURN_NONE option.
* (It is possible that the nth subpattern didn't match, even if
* the pattern as a whole matched. This is different from the
* subpattern matching an empty string.)
* "Next" is the index exited by nextSubMatchIndex, which is passed
* on to subMatchText.
*)
nextSubMatchText:
(#
exit nextSubMatchIndex -> subMatchText
#);
(*
* Shorthand methods to get a given matched subpattern
* You get NONE if the given subpattern didn't match and you set the
* RETURN_NONE option.
* (It is possible that the subpattern didn't match, even if
* the pattern as a whole matched. This is different from the
* subpattern matching an empty string.)
* subN is the same as N -> subMatchText.
*)
sub1: (# exit 1 -> subMatchText #);
sub2: (# exit 2 -> subMatchText #);
sub3: (# exit 3 -> subMatchText #);
sub4: (# exit 4 -> subMatchText #);
sub5: (# exit 5 -> subMatchText #);
sub6: (# exit 6 -> subMatchText #);
sub7: (# exit 7 -> subMatchText #);
sub8: (# exit 8 -> subMatchText #);
sub9: (# exit 9 -> subMatchText #);
(* Gets called if there is no match at all. I'm sure you can think
* of something useful to put here.
* The default does nothing but run INNER, so a further binding is the
* place to put the code.
*)
noMatch:<
(# do INNER; #);
(* Specialise this in order to start at a position other than the
* start of the string
* value is set to 1 before INNER runs, so 1 is what is exited unless
* a further binding assigns another value.
*)
position:<
(#
value: @Integer;
do 1 -> value;
INNER;
exit value
#);
(* Virtual pattern that computes the options for one match. Further
 * bind it and invoke the setters and clearers below from the INNER
 * part. Options that belong to the compilation of the regular
 * expression cannot be given here; you have to give them in the
 * options virtual of the Pcre object itself. Without a further binding
 * the exited value is the global option value of this Pcre object,
 * restricted to the bits in pcre_MATCHOPTIONS.
 *)
options:<
  (# value: @Integer;

     (* Only the options that are useful at match-time (as opposed to
      * init-time) are here. The bits are the pcre_* constants of Pcre.
      *)
     NOTBOL:      (# do value %Bor pcre_NOTBOL      -> value #);
     NOTEOL:      (# do value %Bor pcre_NOTEOL      -> value #);
     NOTEMPTY:    (# do value %Bor pcre_NOTEMPTY    -> value #);
     RETURN_NONE: (# do value %Bor pcre_RETURN_NONE -> value #);

     (* Clearers: AND value with the complement of the named bit *)
     clearNOTBOL:
       (# do value %Band (%Bnot pcre_NOTBOL) -> value #);
     clearNOTEOL:
       (# do value %Band (%Bnot pcre_NOTEOL) -> value #);
     clearNOTEMPTY:
       (# do value %Band (%Bnot pcre_NOTEMPTY) -> value #);
     clearRETURN_NONE:
       (# do value %Band (%Bnot pcre_RETURN_NONE) -> value #);
  do THIS(pcre).options %Band pcre_MATCHOPTIONS -> value;
     INNER;
  exit value
  #);
(* Called before the first match is attempted
* The default does nothing but run INNER.
*)
pre:<
(# do INNER; #);
(* match (entry, action and exit part of the match pattern itself)
* Enter a text reference into the regular expression. Returns true or
* false according to whether the text matched the expression. Executes
* INNER if there is a match.
*)
(* subject: the text entered by the caller.
* matched: the boolean that is exited.
* opt, psn: working storage for the elided body (presumably the values
* of the options and position virtuals - confirm against the body
* fragment).
*)
subject: ^Text;
matched: @Boolean;
opt: @Integer;
psn: @Integer;
enter subject[]
...
exit (matched)
#);
(*
* matchAll: match
* Keeps matching as many times as possible until there are no more matches
* or the end of the string is reached. Returns true if at least one match
* occurs. Calls INNER for each match.
*)
matchAll: match
(#
privatema: @...;
(* The number of matches we have so far. This can be queried in split
* (where it is always one less than the number of matches except the
* last time) or in the INNER part of matchAll, where it is accurate.
*)
matches: @Integer;
(* Further binding of the pre virtual of match, which is called before
* the first match is attempted. The body is elided here (presumably it
* resets the per-run state of matchAll - confirm against the body
* fragment).
*)
pre::<
(#
...
#);
(* Exit a text reference to the text that lies after the previous match
 * (if any) and before the text that matched the regular expression
 * this time around. The text is obtained by handing the position pair
 * from splitPos to subject.sub.
 *)
splitText:
  (# result: ^Text;
     lo, hi: @Integer;
  do splitPos -> (lo, hi);
     (lo, hi) -> subject.sub -> result[];
  exit result[]
  #);
(* Get (as an integer pair) the position of the text after the previous
* match (if any) but before the text that matched the regular
* expression this time around.
* Exits (start, end); the body that fills them in is elided here.
*)
splitPos:
(#
start: @Integer;
end: @Integer;
...
exit (start, end)
#);
(* Gets called once for each split and once at the end. You can call
* splitText and splitPos from here to do something with the split
* strings. Gets called only once if the pattern doesn't match at all.
* thismatch is working storage for the elided body; its meaning is not
* visible in this interface.
*)
split:<
(#
thismatch: @Integer;
...
#);
(* Make sure split and post get called at least once even if there
* is no match at all. You can add code here if you want to do
* something whenever there is no match at all.
* This is a further binding of the noMatch virtual of match; its body
* is elided here.
*)
noMatch::<
(#
...
#);
(* Gets called once at the end. You can call
* splitText and splitPos from here to do something with the rest
* You can also call spn, sp1, sp2, etc to get the first,
* second etc. split text. Restn, rest1, rest2 are similar, but they
* get the rest of the string from the start of the nth split text to
* the end.
*)
post:<
(#
(* Enter the number of a split text in num; exits a reference to that
* split text. The body is elided here.
*)
spn:<
(#
num: @Integer;
result: ^Text;
enter num
...
exit result[]
#);
(* spN is shorthand for N -> spn *)
sp1: (# exit 1-> spn #);
sp2: (# exit 2-> spn #);
sp3: (# exit 3-> spn #);
sp4: (# exit 4-> spn #);
sp5: (# exit 5-> spn #);
sp6: (# exit 6-> spn #);
sp7: (# exit 7-> spn #);
sp8: (# exit 8-> spn #);
sp9: (# exit 9-> spn #);
(* Enter the number of a split text in num; exits a reference to the
* rest of the string from the start of that split text to the end.
* The body is elided here.
*)
restn:<
(#
num: @Integer;
result: ^Text;
enter num
...
exit result[]
#);
(* restN is shorthand for N -> restn *)
rest1: (# exit 1-> restn #);
rest2: (# exit 2-> restn #);
rest3: (# exit 3-> restn #);
rest4: (# exit 4-> restn #);
rest5: (# exit 5-> restn #);
rest6: (# exit 6-> restn #);
rest7: (# exit 7-> restn #);
rest8: (# exit 8-> restn #);
rest9: (# exit 9-> restn #);
(* waysN exits N text references: the first N-1 split texts followed by
* the rest of the string starting at the Nth split text.
*)
ways2: (# exit (sp1, rest2) #);
ways3: (# exit (sp1, sp2, rest3) #);
ways4: (# exit (sp1, sp2, sp3, rest4) #);
ways5: (# exit (sp1, sp2, sp3, sp4, rest5) #);
ways6: (# exit (sp1, sp2, sp3, sp4, sp5, rest6) #);
ways7: (# exit (sp1, sp2, sp3, sp4, sp5, sp6, rest7) #);
ways8: (# exit (sp1, sp2, sp3, sp4, sp5, sp6, sp7, rest8) #);
ways9: (# exit (sp1, sp2, sp3, sp4, sp5, sp6, sp7, sp8, rest9) #);
do INNER;
#);
...
#);
(*
* Replace: match
* Enter a reference to a text and a replacement string. Exits a success
* boolean and a text reference to the new string. If there is no match
* then false, plus a reference to a copy of the original string is exited.
*)
replace: match
(#
(* By overriding this you can put a different value in replacement,
* so that the replacement text can be calculated dynamically (based
* on eg. the contents or position of the matched or submatched texts).
* (You can call matchText to get the text that matched)
* The default exits the text reference that was entered, unchanged.
*)
rep:<
(#
value: ^Text;
enter value[]
do INNER;
exit value[]
#);
(* replacement: the replacement text entered by the caller.
* new: the resulting text. The do-part that builds it is elided here.
*)
replacement: ^Text;
new: ^Text;
enter replacement[]
...
exit
(#
do
(* If new is still NONE here, exit a copy of the subject instead *)
(if new[] = NONE then subject.copy -> new[] if);
exit new[]
#)
#);
(*
 * ReplaceAll: matchAll
 * Enter a reference to a text and a replacement string. Exits a success
 * boolean and a text reference to the new string. If there is no match
 * then false, plus a reference to a copy of the original string is exited.
 *)
replaceAll: matchAll
  (#
     (* By overriding this you can put a different value in replacement,
      * so that the replacement text can be calculated dynamically (based
      * on eg. the contents or position of the matched or submatched texts).
      * (You can call matchText to get the text that matched)
      * The default exits the text reference that was entered, unchanged.
      *)
     rep:<
       (# value: ^Text;
       enter value[]
       do INNER;
       exit value[]
       #);

     (* Appends the text after the last match to the result.
      * post is also called when the pattern did not match at all. In
      * that case new is still NONE, and appending to it would be a
      * NONE-reference error. The append is therefore skipped, and the
      * exit part below exits a copy of the subject instead.
      *)
     post::<
       (#
       do (if new[] <> NONE then splitText -> new.append if);
          INNER;
       #);

     (* replacement: the replacement text entered by the caller.
      * new: the text being built; NONE until the first match.
      *)
     replacement: ^Text;
     new: ^Text;
  enter replacement[]
  do
     (* This part runs once for every match *)
     (if new[] = NONE then
        (* First match: start the result with the text before it.
         * extend is called with the subject length (presumably to
         * reserve room for the rest - confirm against Text).
         *)
        splitText -> new[];
        subject.lgth -> new.extend;
     else
        (* Later matches: add the text between the previous match and
         * this one.
         *)
        splitText -> new.append;
     if);
     (* Add the replacement, as possibly rewritten by rep *)
     replacement[] -> rep -> new.append;
     INNER;
  exit
     (#
     do
        (* If there was no match, new is still NONE: exit a copy of the
         * subject.
         *)
        (if new[] = NONE then subject.copy -> new[] if);
     exit new[]
     #)
  #);
(* Pcre itself enters init (which takes a text reference and compiles it
* to a regular expression) and exits a reference to itself, which lets you
* dynamically create a regexp and call a method on it in one line
*)
enter init
exit this(Pcre)[]
#)
(* 13.15 Pcre Interface. © 1990-2002 Mjølner Informatics.
* Modified: Wednesday January 10th 2001 at 16:28
*)