ORIGIN 'betaenv'; INCLUDE 'private/pcrelib'; BODY 'private/pcrebody'; (* * COPYRIGHT * Copyright (C) Mjolner Informatics, 2000 * All rights reserved. * Written by Erik Corry *) --- lib: attributes --- (* * Perl compatible regular expressions, based on Philip Hazel's PCRE * library. See his documentation and perl documentation for details. * To activate the /i /x /m or /s options you can use the inline notation * (?x) notation either at the top level of the regular expression or * in a subexpression. You can disable the options again with (?-x). You * can also use the comments below. * * See also pcreDemo.bet in the basiclib/demo/pcre directory for some uses * for this stuff. *) (* HOW TO DO SOME TYPICAL PERL THINGS * * Here are a few things that are very easy to do in perl with the * equivalent using BETA's perl-compatible regular expression support. * As you can see, the BETA version is often a little longer - this is * the penalty you pay for having a general purpose language. You can * save some space at the expense of readability and perhaps efficiency * by initialising the Pcre object inline. * * Assume * pre: @Pcre; * ok: @boolean; * * Desc: Test whether a string matches a pattern * Perl: if $sample =~ /trigger/ ... * BETA: 'trigger' -> pre; * (if sample[] -> pre.match then ... if) * Alternative: * (if sample[] -> ('trigger' -> Pcre).match then ... if) * * Desc: Replace a text in a string with another text * Perl: $sample =~ s/gun/pistol/; * BETA: 'gun' -> pre; * (sample[], 'pistol') -> pre.replace -> (ok, sample[]); * * For /g use replaceAll instead of replace * For /e use rep, see HTMLise in pcreDemo in ~beta/basiclib/demo/pcre * * Desc: Test for case insensitive match * Perl: if $sample =~ /trigger/i ... * BETA: '(?i)trigger' -> pre; * (if sample[] -> pre.match then ... if); * Alternative: * 'trigger' -> pre (# options:: (# do CASELESS #) #); * (if sample[] -> pre.match then ... 
if);
 *       Likewise for /x
 *
 * Desc: Split an input line three ways into fields using : as separator
 * Perl: ($wordone, $wordtwo, $rest) = split(/:/, $sample, 3);
 * BETA: sample[] -> (':' -> Pcre).matchAll
 *       (#
 *         post:: (# do sp1 -> wordone[];
 *                      sp2 -> wordtwo[];
 *                      rest3 -> rest[];
 *                #)
 *       #)
 *       Alternative:
 *       sample[] -> (':' -> Pcre).matchAll
 *       (# post:: (# do ways3 -> (wordone[], wordtwo[], rest[]) #) #);
 *)

(* Pcre
 * A Perl-compatible regular expression object. Entering a text reference
 * runs init on it; the object then exits a reference to itself, so an
 * operation (match, matchAll, replace, replaceAll) can be applied in the
 * same evaluation. The bodies marked ... live in the private body
 * fragment and are not visible from this interface.
 *)
Pcre:
  (# (* Exception specialisation that enters an error text.
      * Presumably raised by init when the expression cannot be compiled;
      * init's body is not visible here, so confirm in the body fragment.
      *)
     compilation_error:< Exception
       (# errortext: ^Text;
       enter errortext[]
       do INNER;
       #);

     (* Options: See pcre.h and doc *)
     pcre_CASELESS:        (# exit 1 #);
     pcre_MULTILINE:       (# exit 2 #);
     pcre_DOTALL:          (# exit 4 #);
     pcre_EXTENDED:        (# exit 8 #);
     pcre_ANCHORED:        (# exit 16 #);
     pcre_DOLLAR_ENDONLY:  (# exit 32 #);
     pcre_EXTRA:           (# exit 64 #);
     pcre_NOTBOL:          (# exit 128 #);
     pcre_NOTEOL:          (# exit 256 #);
     pcre_UNGREEDY:        (# exit 512 #);
     pcre_NOTEMPTY:        (# exit 1024 #);

     (* All sixteen low bits set. Presumably the mask that separates the
      * option bits above from the BETA-only bits below - confirm in the
      * body fragment.
      *)
     pcre_NONBETAOPTIONS:  (# exit 65535 #);

     (* Only in BETA library version *)

     (* Use non-localised English char classes *)
     (* You have to set this when compiling the regexp, not when matching *)
     pcre_C_LOCALE:        (# exit 65536 #);

     (* Study the regular expression after compiling it *)
     (* You have to set this when compiling the regexp, not when matching *)
     pcre_DO_STUDY:        (# exit 131072 #);

     (* Give none instead of zero length strings for cases where there is no
      * match. This is more correct, but you have to program more carefully
      * to avoid runtime errors.
      *)
     pcre_RETURN_NONE:     (# exit 262144 #);

     (* The union of the option bits that match.options lets through from
      * the options given when the object was set up.
      *)
     pcre_MATCHOPTIONS:
       (#
       exit pcre_NOTBOL %Bor pcre_NOTEOL %Bor pcre_NOTEMPTY
            %Bor pcre_RETURN_NONE
       #);

     (* For internal use *)
     pcre_INFO_OPTIONS:       (# exit 0 #);
     pcre_INFO_SIZE:          (# exit 1 #);
     pcre_INFO_CAPTURECOUNT:  (# exit 2 #);
     pcre_INFO_BACKREFMAX:    (# exit 3 #);
     pcre_INFO_FIRSTCHAR:     (# exit 4 #);
     pcre_INFO_FIRSTTABLE:    (# exit 5 #);
     pcre_INFO_LASTLITERAL:   (# exit 6 #);

     pcre_ERROR_NOMATCH:      (# exit -1 #);
     pcre_ERROR_NULL:         (# exit -2 #);
     pcre_ERROR_BADOPTION:    (# exit -3 #);
     pcre_ERROR_BADMAGIC:     (# exit -4 #);
     pcre_ERROR_UNKNOWN_NODE: (# exit -5 #);
     pcre_ERROR_NOMEMORY:     (# exit -6 #);
     pcre_ERROR_NOSUBSTRING:  (# exit -7 #);

     (* Private internal state *)
     private: @...;

     (* Read-only for users of pcre. Tells you how many subpatterns your
      * pattern has. Only useful if you are reading regular expressions from
      * a config file or from the user, since otherwise you should know this
      * figure already :-]
      *)
     subPatterns: @Integer;

     (* Specialise this in order to give options when compiling the
      * regular expression and default options when matching.
      * Starts from 0, runs INNER (where the setters below are called) and
      * exits the accumulated bit set.
      *)
     options:<
       (# (* Options: See above *)
          CASELESS:        (# do value %Bor 1 -> value #);
          MULTILINE:       (# do value %Bor 2 -> value #);
          DOTALL:          (# do value %Bor 4 -> value #);
          EXTENDED:        (# do value %Bor 8 -> value #);
          ANCHORED:        (# do value %Bor 16 -> value #);
          DOLLAR_ENDONLY:  (# do value %Bor 32 -> value #);
          EXTRA:           (# do value %Bor 64 -> value #);
          NOTBOL:          (# do value %Bor 128 -> value #);
          NOTEOL:          (# do value %Bor 256 -> value #);
          UNGREEDY:        (# do value %Bor 512 -> value #);
          NOTEMPTY:        (# do value %Bor 1024 -> value #);
          C_LOCALE:        (# do value %Bor 65536 -> value #);
          DO_STUDY:        (# do value %Bor 131072 -> value #);
          RETURN_NONE:     (# do value %Bor 262144 -> value #);

          clearCASELESS:       (# do value %Band (%Bnot 1) -> value #);
          clearMULTILINE:      (# do value %Band (%Bnot 2) -> value #);
          clearDOTALL:         (# do value %Band (%Bnot 4) -> value #);
          clearEXTENDED:       (# do value %Band (%Bnot 8) -> value #);
          clearANCHORED:       (# do value %Band (%Bnot 16) -> value #);
          clearDOLLAR_ENDONLY: (# do value %Band (%Bnot 32) -> value #);
          clearEXTRA:          (# do value %Band (%Bnot 64) -> value #);
          clearNOTBOL:         (# do value %Band (%Bnot 128) -> value #);
          clearNOTEOL:         (# do value %Band (%Bnot 256) -> value #);
          clearUNGREEDY:       (# do value %Band (%Bnot 512) -> value #);
          clearNOTEMPTY:       (# do value %Band (%Bnot 1024) -> value #);
          clearC_LOCALE:       (# do value %Band (%Bnot 65536) -> value #);
          clearDO_STUDY:       (# do value %Band (%Bnot 131072) -> value #);
          clearRETURN_NONE:    (# do value %Band (%Bnot 262144) -> value #);

          value: @Integer;
       do 0 -> value;
          INNER;
       exit value
       #);

     (* Enters the text of the regular expression. The body is in the
      * private fragment; according to the comment at the end of Pcre it
      * compiles the text to a regular expression.
      *)
     init:
       (# error: ^CString;
          exp: ^Text;
          opt: @Integer;
          errtext: @Integer;
          erroffset: @Integer;
       enter exp[]
       ...
       #);

     match:
       (# result: @Integer;
          subMatchCounter: @Integer;

          (* Advance subMatchCounter by one and exit the new value. Used
           * by nextSubMatchPos and nextSubMatchText to step through the
           * submatches in order.
           *)
          nextSubMatchIndex:
            (#
            do (* Assignment is written with '->' in BETA; '=' is only the
                * equality test and would leave the counter unchanged.
                *)
               subMatchCounter + 1 -> subMatchCounter;
            exit subMatchCounter
            #);

          (* Get (as an integer pair) the position of the text that matched
           * the regular expression in the original text.
           *)
          matchPos:
            (# start: @Integer;
               end: @Integer;
            ...
            exit (start, end)
            #);

          (* Get (as a text reference) the text that matched the regular
           * expression.
           *)
          matchText:
            (# result: ^Text;
            do matchPos -> subject.sub -> result[];
            exit result[]
            #);

          (* Get (as a text reference) the text before the text that matched
           * the regular expression.
           *)
          preMatchText:
            (# result: ^Text;
            ...
            exit result[]
            #);

          (* Get (as a text reference) the text after the text that matched
           * the regular expression.
           *)
          postMatchText:
            (# result: ^Text;
            ...
            exit result[]
            #);

          (* Get (as an integer pair) the position of the nth submatch in the
           * original text. You get (0,0) if the nth subpattern didn't match.
           * (It is possible that the nth subpattern didn't match, even if
           * the pattern as a whole matched. This is different from the
           * subpattern matching an empty string.)
           *)
          subMatchPos:
            (# index: @Integer;
               start: @Integer;
               end: @Integer;
            enter index
            ...
            exit (start, end)
            #);

          (* Get (as an integer pair) the position of the next submatch in the
           * original text. You get (0,0) if the next subpattern didn't match.
           * (It is possible that the nth subpattern didn't match, even if
           * the pattern as a whole matched. This is different from the
           * subpattern matching an empty string.)
           *)
          nextSubMatchPos:
            (# exit nextSubMatchIndex -> subMatchPos #);

          (* Get (as a text reference) the text of the nth submatch.
           * You get NONE if the nth subpattern didn't match and you set
           * the RETURN_NONE option.
           * (It is possible that the nth subpattern didn't match, even if
           * the pattern as a whole matched. This is different from the
           * subpattern matching an empty string.)
           *)
          subMatchText:
            (# index: @Integer;
               start: @Integer;
               end: @Integer;
               result: ^Text;
            enter index
            ...
            exit result[]
            #);

          (* Get (as a text reference) the text of the next submatch.
           * You get NONE if the next subpattern didn't match and you set
           * the RETURN_NONE option.
           * (It is possible that the nth subpattern didn't match, even if
           * the pattern as a whole matched. This is different from the
           * subpattern matching an empty string.)
           *)
          nextSubMatchText:
            (# exit nextSubMatchIndex -> subMatchText #);

          (*
           * Shorthand methods to get a given matched subpattern
           * You get NONE if the given subpattern didn't match and you set the
           * RETURN_NONE option.
           * (It is possible that the subpattern didn't match, even if
           * the pattern as a whole matched. This is different from the
           * subpattern matching an empty string.)
           *)
          sub1: (# exit 1 -> subMatchText #);
          sub2: (# exit 2 -> subMatchText #);
          sub3: (# exit 3 -> subMatchText #);
          sub4: (# exit 4 -> subMatchText #);
          sub5: (# exit 5 -> subMatchText #);
          sub6: (# exit 6 -> subMatchText #);
          sub7: (# exit 7 -> subMatchText #);
          sub8: (# exit 8 -> subMatchText #);
          sub9: (# exit 9 -> subMatchText #);

          (* Gets called if there is no match at all. I'm sure you can think
           * of something useful to put here.
           *)
          noMatch:<
            (#
            do INNER;
            #);

          (* Specialise this in order to start at a position other than the
           * start of the string. The default value is 1.
           *)
          position:<
            (# value: @Integer;
            do 1 -> value;
               INNER;
            exit value
            #);

          (* Specialise this in order to give options when executing the
           * regular expression. Doesn't work for options used to compile
           * the regular expression, you had to give them earlier. If you
           * don't specialise this then you get the global options for this
           * pcre object.
           *)
          options:<
            (# (* Options: See above
                * Only the options that are useful at match-time (as opposed to
                * init-time) are here
                *)
               NOTBOL:       (# do value %Bor 128 -> value #);
               NOTEOL:       (# do value %Bor 256 -> value #);
               NOTEMPTY:     (# do value %Bor 1024 -> value #);
               RETURN_NONE:  (# do value %Bor 262144 -> value #);

               clearNOTBOL:      (# do value %Band (%Bnot 128) -> value #);
               clearNOTEOL:      (# do value %Band (%Bnot 256) -> value #);
               clearNOTEMPTY:    (# do value %Band (%Bnot 1024) -> value #);
               clearRETURN_NONE: (# do value %Band (%Bnot 262144) -> value #);

               value: @Integer;
            do (* Start from the object-wide options, keeping only the
                * bits that are meaningful at match time.
                *)
               THIS(pcre).options %Band pcre_MATCHOPTIONS -> value;
               INNER;
            exit value
            #);

          (* Called before the first match is attempted *)
          pre:<
            (#
            do INNER;
            #);

          (* match
           * Enter a text reference into the regular expression. Returns true or
           * false according to whether the text matched the expression. Executes
           * INNER if there is a match.
           *)
          subject: ^Text;
          matched: @Boolean;
          opt: @Integer;
          psn: @Integer;
       enter subject[]
       ...
       exit (matched)
       #);

     (*
      * matchAll: match
      * Keeps matching as many times as possible until there are no more matches
      * or the end of the string is reached. Returns true if at least one match
      * occurs. Calls INNER for each match.
      *)
     matchAll: match
       (# privatema: @...;

          (* The number of matches we have so far. This can be queried in split
           * (where it is always one less than the number of matches except the
           * last time) or in the INNER part of matchAll, where it is accurate.
           *)
          matches: @Integer;

          pre::<
            (#
            ...
            #);

          (* Get (as a text reference) the text after the previous match (if any)
           * but before the text that matched the regular expression this time
           * around.
           *)
          splitText:
            (# result: ^Text;
            do splitPos -> subject.sub -> result[];
            exit result[]
            #);

          (* Get (as an integer pair) the position of the text after the previous
           * match (if any) but before the text that matched the regular
           * expression this time around.
           *)
          splitPos:
            (# start: @Integer;
               end: @Integer;
            ...
            exit (start, end)
            #);

          (* Gets called once for each split and once at the end. You can call
           * splitText and splitPos from here to do something with the split
           * strings. Gets called only once if the pattern doesn't match at all.
           *)
          split:<
            (# thismatch: @Integer;
            ...
            #);

          (* Make sure split and post get called at least once even if there
           * is no match at all. You can add code here if you want to do
           * something whenever there is no match at all.
           *)
          noMatch::<
            (#
            ...
            #);

          (* Gets called once at the end. You can call
           * splitText and splitPos from here to do something with the rest
           * You can also call spn, sp1, sp2, etc to get the first,
           * second etc. split text. Restn, rest1, rest2 are similar, but they
           * get the rest of the string from the start of the nth split text to
           * the end.
           *)
          post:<
            (# spn:<
                 (# num: @Integer;
                    result: ^Text;
                 enter num
                 ...
                 exit result[]
                 #);
               sp1: (# exit 1 -> spn #);
               sp2: (# exit 2 -> spn #);
               sp3: (# exit 3 -> spn #);
               sp4: (# exit 4 -> spn #);
               sp5: (# exit 5 -> spn #);
               sp6: (# exit 6 -> spn #);
               sp7: (# exit 7 -> spn #);
               sp8: (# exit 8 -> spn #);
               sp9: (# exit 9 -> spn #);

               restn:<
                 (# num: @Integer;
                    result: ^Text;
                 enter num
                 ...
                 exit result[]
                 #);
               rest1: (# exit 1 -> restn #);
               rest2: (# exit 2 -> restn #);
               rest3: (# exit 3 -> restn #);
               rest4: (# exit 4 -> restn #);
               rest5: (# exit 5 -> restn #);
               rest6: (# exit 6 -> restn #);
               rest7: (# exit 7 -> restn #);
               rest8: (# exit 8 -> restn #);
               rest9: (# exit 9 -> restn #);

               (* waysN exits the first N-1 split texts followed by the
                * rest of the string starting at the Nth split text.
                *)
               ways2: (# exit (sp1, rest2) #);
               ways3: (# exit (sp1, sp2, rest3) #);
               ways4: (# exit (sp1, sp2, sp3, rest4) #);
               ways5: (# exit (sp1, sp2, sp3, sp4, rest5) #);
               ways6: (# exit (sp1, sp2, sp3, sp4, sp5, rest6) #);
               ways7: (# exit (sp1, sp2, sp3, sp4, sp5, sp6, rest7) #);
               ways8: (# exit (sp1, sp2, sp3, sp4, sp5, sp6, sp7, rest8) #);
               ways9:
                 (# exit (sp1, sp2, sp3, sp4, sp5, sp6, sp7, sp8, rest9) #);
            do INNER;
            #);
       ...
       #);

     (*
      * Replace: match
      * Enter a reference to a text and a replacement string. Exits a success
      * boolean and a text reference to the new string. If there is no match
      * then false, plus a reference to a copy of the original string is exited.
      *)
     replace: match
       (# (* By overriding this you can put a different value in replacement,
           * so that the replacement text can be calculated dynamically (based
           * on eg. the contents or position of the matched or submatched texts).
           * (You can call matchText to get the text that matched)
           *)
          rep:<
            (# value: ^Text;
            enter value[]
            do INNER;
            exit value[]
            #);

          replacement: ^Text;
          new: ^Text;
       enter replacement[]
       ...
       exit
         (#
         do (* No result text was built: hand back a copy of the subject *)
            (if new[] = NONE then subject.copy -> new[] if);
         exit new[]
         #)
       #);

     (*
      * ReplaceAll: matchAll
      * Enter a reference to a text and a replacement string. Exits a success
      * boolean and a text reference to the new string. If there is no match
      * then false, plus a reference to a copy of the original string is exited.
      *)
     replaceAll: matchAll
       (# (* By overriding this you can put a different value in replacement,
           * so that the replacement text can be calculated dynamically (based
           * on eg. the contents or position of the matched or submatched texts).
           * (You can call matchText to get the text that matched)
           *)
          rep:<
            (# value: ^Text;
            enter value[]
            do INNER;
            exit value[]
            #);

          (* Appends the text after the last match to the result. *)
          post::<
            (#
            do (* post is also called when the pattern never matched, and
                * then no result text has been created yet. Skip the
                * append in that case so that the exit part below can
                * fall back to a copy of the subject instead of
                * dereferencing NONE.
                *)
               (if new[] <> NONE then splitText -> new.append if);
               INNER;
            #);

          replacement: ^Text;
          new: ^Text;
       enter replacement[]
       do (* Runs once per match: first add the text between the previous
           * match and this one. On the first match the result text is
           * created from that split text and extended by the subject
           * length.
           *)
          (if new[] = NONE then
              splitText -> new[];
              subject.lgth -> new.extend;
           else
              splitText -> new.append;
          if);
          (* ... then add the (possibly recomputed) replacement text *)
          replacement[] -> rep -> new.append;
          INNER;
       exit
         (#
         do (* No match at all: hand back a copy of the subject *)
            (if new[] = NONE then subject.copy -> new[] if);
         exit new[]
         #)
       #);

     (* Pcre itself enters init (which takes a text reference and compiles it
      * to a regular expression) and exits a reference to itself, which lets you
      * dynamically create a regexp and call a method on it in one line
      *)
  enter init
  exit this(Pcre)[]
  #)
13.15 Pcre Interface | © 1990-2002 Mjølner Informatics |
[Modified: Wednesday January 10th 2001 at 16:28]
|