13.15 Pcre Interface

ORIGIN 'betaenv';
BODY 'private/pcrebody';
(*
 * COPYRIGHT
 *       Copyright (C) Mjolner Informatics, 2000,2001,2002
 *       All rights reserved.
 *       Written by Erik Corry
 *)
--- lib: attributes ---

(*
 * Perl compatible regular expressions, based on Philip Hazel's PCRE
 * library.   See his documentation and perl documentation for details.
 * To activate the /i /x /m or /s options you can use the inline (?x)
 * notation either at the top level of the regular expression or
 * in a subexpression.  You can disable the options again with (?-x). You
 * can also specialise the options virtual, see the comments below.
 * 
 * See also pcreDemo.bet in the basiclib/demo/pcre directory for some uses
 * for this stuff.
 *)

(* HOW TO DO SOME TYPICAL PERL THINGS
 *
 * Here are a few things that are very easy to do in perl with the
 * equivalent using BETA's perl-compatible regular expression support.
 * As you can see, the BETA version is often a little longer - this is
 * the penalty you pay for having a general purpose language.  You can
 * save some space at the expense of readability and perhaps efficiency
 * by initialising the Pcre object inline.
 *
 * Assume
 * pre: @Pcre;
 * ok: @boolean;
 *
 * Desc:  Test whether a string matches a pattern
 * Perl:  if $sample =~ /trigger/ ...
 * BETA:  'trigger' -> pre;
 *        (if sample[] -> pre.match then ... if)
 * Alternative:
 *        (if sample[] -> ('trigger' -> Pcre).match then ... if)
 *
 * Desc:  Replace a text in a string with another text
 * Perl:  $sample =~ s/gun/pistol/;
 * BETA:  'gun' -> pre;
 *        (sample[], 'pistol') -> pre.replace -> (ok, sample[]);
 *
 * For /g use replaceAll instead of replace
 * For /e use rep, see HTMLise in pcreDemo in ~beta/basiclib/demo/pcre
 *
 * Desc:  Test for case insensitive match
 * Perl:  if $sample =~ /trigger/i ...
 * BETA:  '(?i)trigger' -> pre;
 *        (if sample[] -> pre.match then ... if);
 * Alternative:
 *        'trigger' -> pre (# options:: (# do CASELESS #) #);
 *        (if sample[] -> pre.match then ... if);
 * Likewise for /x
 *
 * Desc:  Split an input line three ways into fields using : as separator
 * Perl:  ($wordone, $wordtwo, $rest) = split(/:/, $sample, 3);
 * BETA:  sample[] -> (':' -> Pcre).matchAll
 *        (#
 *           post:: (# do sp1 -> wordone[];
 *                        sp2 -> wordtwo[];
 *                        rest3 -> rest[];
 *                  #)
 *        #)
 * Alternative:
 *        sample[] -> (':' -> Pcre).matchAll
 *        (# post:: (# do ways3 -> (wordone[], wordtwo[], rest[]) #) #);
 *)



Pcre:
  (#
     <<SLOT PcreLib: attributes>>;
     
     (* Virtual exception pattern that enters a regular expression and an
      * error text.  Presumably raised when the expression entered to init
      * cannot be compiled - the action part is in the private fragment, so
      * confirm there.
      *)
     compilation_error:< Exception
       (# regexp: ^text;
          errortext: ^text;
       enter (regexp[],errortext[])
       ...
       #);

     (* Options:  See pcre.h and doc *)
     pcre_CASELESS:       (# exit   1 #);
     pcre_MULTILINE:      (# exit   2 #);
     pcre_DOTALL:         (# exit   4 #);
     pcre_EXTENDED:       (# exit   8 #);
     pcre_ANCHORED:       (# exit  16 #);
     pcre_DOLLAR_ENDONLY: (# exit  32 #);
     pcre_EXTRA:          (# exit  64 #);
     pcre_NOTBOL:         (# exit 128 #);
     pcre_NOTEOL:         (# exit 256 #);
     pcre_UNGREEDY:       (# exit 512 #);
     pcre_NOTEMPTY:       (# exit 1024 #);

     pcre_NONBETAOPTIONS: (# exit 65535 #);

     (* Only in BETA library version *)
     (* Use non-localised English char classes *)
     (* You have to set this when compiling the regexp, not when matching *)
     pcre_C_LOCALE:       (# exit 65536 #);
     (* Study the regular expression after compiling it *)
     (* You have to set this when compiling the regexp, not when matching *)
     pcre_DO_STUDY:       (# exit 131072 #);
     (* Give none instead of zero length strings for cases where there is no
      * match.  This is more correct, but you have to program more carefully
      * to avoid runtime errors.
      *)
     pcre_RETURN_NONE:        (# exit 262144 #);

     pcre_MATCHOPTIONS:       (# exit pcre_NOTBOL %Bor
                                 pcre_NOTEOL %Bor
                                 pcre_NOTEMPTY %Bor
                                 pcre_RETURN_NONE #);

     (* For internal use *)
     pcre_INFO_OPTIONS:      (# exit  0 #);
     pcre_INFO_SIZE:         (# exit  1 #);
     pcre_INFO_CAPTURECOUNT: (# exit  2 #);
     pcre_INFO_BACKREFMAX:   (# exit  3 #);
     pcre_INFO_FIRSTCHAR:    (# exit  4 #);
     pcre_INFO_FIRSTTABLE:   (# exit  5 #);
     pcre_INFO_LASTLITERAL:  (# exit  6 #);

     pcre_ERROR_NOMATCH:     (# exit  -1 #);
     pcre_ERROR_NULL:        (# exit  -2 #);
     pcre_ERROR_BADOPTION:   (# exit  -3 #);
     pcre_ERROR_BADMAGIC:    (# exit  -4 #);
     pcre_ERROR_UNKNOWN_NODE:(# exit  -5 #);
     pcre_ERROR_NOMEMORY:    (# exit  -6 #);
     pcre_ERROR_NOSUBSTRING: (# exit  -7 #);

     (* Private internal state *)
     private: @...;

     (* Read-only for users of pcre.  Tells you how many subpatterns your
      * pattern has.  Only useful if you are reading regular expressions from
      * a config file or from the user, since otherwise you should know this
      * figure already :-]
      *)
     subPatterns: @Integer;

     (* Specialise this in order to give options when compiling the
      * regular expression and default options when matching.
      *)
     options:< integerValue
       (#
          (* Each option pattern switches one flag on in value and is
           * listed together with the clearXXX pattern that switches the
           * same flag off again.  The bit masks are the pcre_XXX constants
           * declared in the enclosing Pcre pattern (see pcre.h and doc).
           *)
          CASELESS:            (# do value %Bor pcre_CASELESS -> value #);
          clearCASELESS:       (# do value %Band (%Bnot pcre_CASELESS) -> value #);

          MULTILINE:           (# do value %Bor pcre_MULTILINE -> value #);
          clearMULTILINE:      (# do value %Band (%Bnot pcre_MULTILINE) -> value #);

          DOTALL:              (# do value %Bor pcre_DOTALL -> value #);
          clearDOTALL:         (# do value %Band (%Bnot pcre_DOTALL) -> value #);

          EXTENDED:            (# do value %Bor pcre_EXTENDED -> value #);
          clearEXTENDED:       (# do value %Band (%Bnot pcre_EXTENDED) -> value #);

          ANCHORED:            (# do value %Bor pcre_ANCHORED -> value #);
          clearANCHORED:       (# do value %Band (%Bnot pcre_ANCHORED) -> value #);

          DOLLAR_ENDONLY:      (# do value %Bor pcre_DOLLAR_ENDONLY -> value #);
          clearDOLLAR_ENDONLY: (# do value %Band (%Bnot pcre_DOLLAR_ENDONLY) -> value #);

          EXTRA:               (# do value %Bor pcre_EXTRA -> value #);
          clearEXTRA:          (# do value %Band (%Bnot pcre_EXTRA) -> value #);

          NOTBOL:              (# do value %Bor pcre_NOTBOL -> value #);
          clearNOTBOL:         (# do value %Band (%Bnot pcre_NOTBOL) -> value #);

          NOTEOL:              (# do value %Bor pcre_NOTEOL -> value #);
          clearNOTEOL:         (# do value %Band (%Bnot pcre_NOTEOL) -> value #);

          UNGREEDY:            (# do value %Bor pcre_UNGREEDY -> value #);
          clearUNGREEDY:       (# do value %Band (%Bnot pcre_UNGREEDY) -> value #);

          NOTEMPTY:            (# do value %Bor pcre_NOTEMPTY -> value #);
          clearNOTEMPTY:       (# do value %Band (%Bnot pcre_NOTEMPTY) -> value #);

          (* BETA-only options, see the pcre_XXX declarations *)
          C_LOCALE:            (# do value %Bor pcre_C_LOCALE -> value #);
          clearC_LOCALE:       (# do value %Band (%Bnot pcre_C_LOCALE) -> value #);

          DO_STUDY:            (# do value %Bor pcre_DO_STUDY -> value #);
          clearDO_STUDY:       (# do value %Band (%Bnot pcre_DO_STUDY) -> value #);

          RETURN_NONE:         (# do value %Bor pcre_RETURN_NONE -> value #);
          clearRETURN_NONE:    (# do value %Band (%Bnot pcre_RETURN_NONE) -> value #);

       (* Start with no options set; specialisations add theirs in INNER *)
       do 0 -> value;
          INNER;
       #);

     (* Enters a reference to the text of a regular expression.  According
      * to the note at the end of Pcre this compiles the text to a regular
      * expression; the action part is in the private fragment.
      *)
     init:
       (#
          exp: ^Text;
          
       enter exp[]
       ...
       #);

     match:
       (#
          result: @Integer;
          subMatchCounter: @Integer;
          (* Advance subMatchCounter by one and exit the new value.  Used by
           * nextSubMatchPos and nextSubMatchText to step through the
           * subpatterns one at a time.
           *)
          nextSubMatchIndex:
            (#
            (* Assignment in BETA is '->'; '=' is only the equality test
             * and would leave the counter unchanged.
             *)
            do subMatchCounter + 1 -> subMatchCounter;
            exit subMatchCounter
            #);

          (* Get (as an integer pair) the position of the text that matched
           * the regular expression in the original text.
           *)
          matchPos:
            (#
               start: @Integer;
               end: @Integer;
            ...
            exit (start, end)
            #);

          (* Get (as a text reference) the text that matched the regular
           * expression.
           *)
          matchText:
            (#
               result: ^Text;
            do
               matchPos -> subject.sub -> result[];
            exit result[]
            #);

          (* Get (as a text reference) the text before the text that matched
           * the regular expression.
           *)
          preMatchText:
            (#
               result: ^Text;
            ...
            exit result[]
            #);

          (* Get (as a text reference) the text after the text that matched
           * the regular expression.
           *)
          postMatchText:
            (#
               result: ^Text;
            ...
            exit result[]
            #);

          (* Get (as an integer pair) the position of the nth submatch in the
           * original text.  You get (0,0) if the nth subpattern didn't match.
           * (It is possible that the nth subpattern didn't match, even if
           * the pattern as a whole matched.  This is different from the
           * subpattern matching an empty string.)
           *)
          subMatchPos:
            (#
               index: @Integer;
               start: @Integer;
               end: @Integer;
            enter index
            ...
            exit (start, end)
            #);

          (* Get (as an integer pair) the position of the next submatch in the
           * original text.  You get (0,0) if the next subpattern didn't match.
           * (It is possible that the nth subpattern didn't match, even if
           * the pattern as a whole matched.  This is different from the
           * subpattern matching an empty string.)
           *)
          nextSubMatchPos:
            (#
            exit nextSubMatchIndex -> subMatchPos
            #);

          (* Get (as a text reference) the text of the nth submatch in the
           * original text.  You get NONE if the nth subpattern didn't match and
           * you set the RETURN_NONE option.
           * (It is possible that the nth subpattern didn't match, even if
           * the pattern as a whole matched.  This is different from the
           * subpattern matching an empty string.)
           *)
          subMatchText:
            (#
               index: @Integer;
               start: @Integer;
               end: @Integer;
               result: ^Text;
            enter index
            ...
            exit result[]
            #);

          (* Get (as a text reference) the text of the next submatch in the
           * original text.  You get NONE if the next subpattern didn't match
           * and you set the RETURN_NONE option.
           * (It is possible that the nth subpattern didn't match, even if
           * the pattern as a whole matched.  This is different from the
           * subpattern matching an empty string.)
           *)
          nextSubMatchText:
            (#
            exit nextSubMatchIndex -> subMatchText
            #);

          (*
           * Shorthand methods to get a given matched subpattern 
           * You get NONE if the given subpattern didn't match and you set the
           * option.
           * (It is possible that the subpattern didn't match, even if
           * the pattern as a whole matched.  This is different from the
           * subpattern matching an empty string.)
           *)
          sub1: (# exit 1 -> subMatchText #);
          sub2: (# exit 2 -> subMatchText #);
          sub3: (# exit 3 -> subMatchText #);
          sub4: (# exit 4 -> subMatchText #);
          sub5: (# exit 5 -> subMatchText #);
          sub6: (# exit 6 -> subMatchText #);
          sub7: (# exit 7 -> subMatchText #);
          sub8: (# exit 8 -> subMatchText #);
          sub9: (# exit 9 -> subMatchText #);

          (* Gets called if there is no match at all.  I'm sure you can think
           * of something useful to put here.
           *)
          noMatch:<(# do INNER; #);

          (* Specialise this in order to start at a position other than the
           * start of the string
           *)
          position:< integerValue
            (# 
            do 1 -> value;
               INNER;
            #);

          (* Specialise this in order to give options when executing the
           * regular expression.  Doesn't work for options used to compile
           * the regular expression, you had to give them earlier.  If you
           * don't specialise this then you get the global options for this
           * pcre object.
           *)
          options:< integerValue
            (#
               (* Only the options that make sense at match-time (as opposed
                * to init-time) are offered here.  Each setter is listed with
                * the clearXXX pattern that removes the same flag; the masks
                * are the pcre_XXX constants of the enclosing Pcre pattern.
                *)
               NOTBOL:           (# do value %Bor pcre_NOTBOL -> value #);
               clearNOTBOL:      (# do value %Band (%Bnot pcre_NOTBOL) -> value #);

               NOTEOL:           (# do value %Bor pcre_NOTEOL -> value #);
               clearNOTEOL:      (# do value %Band (%Bnot pcre_NOTEOL) -> value #);

               NOTEMPTY:         (# do value %Bor pcre_NOTEMPTY -> value #);
               clearNOTEMPTY:    (# do value %Band (%Bnot pcre_NOTEMPTY) -> value #);

               RETURN_NONE:      (# do value %Bor pcre_RETURN_NONE -> value #);
               clearRETURN_NONE: (# do value %Band (%Bnot pcre_RETURN_NONE) -> value #);

            (* Default: the match-time subset of the options of this Pcre
             * object; specialisations adjust it in INNER.
             *)
            do THIS(pcre).options %Band pcre_MATCHOPTIONS -> value;
               INNER;
            #);

          (* Called before the first match is attempted
           *)
          pre:<
            (# do INNER; #);

          (* match
           * Enter a text reference into the regular expression.  Returns true or
           * false according to whether the text matched the expression.  Executes
           * INNER if there is a match.
           *)
          subject: ^Text;
          matched: @Boolean;
          opt: @Integer;
          psn: @Integer;
       enter subject[]
       ...
       exit (matched)
       #);

     (*
      * matchAll: match
      * Keeps matching as many times as possible until there are no more matches
      * or the end of the string is reached.  Returns true if at least one match
      * occurs.  Calls INNER for each match.
      *)
     matchAll: match
       (#
          privatema: @...;

          (* The number of matches we have so far.  This can be queried in split
           * (where it is always one less than the number of matches except the
           * last time) or in the INNER part of matchAll, where it is accurate.
           *)
          matches: @Integer;

          pre::<
            (#
            ...
            #);

          (* Get (as a text reference) the text after the previous match (if any)
           * but before the text that matched the regular expression this time
           * around.
           *)
          splitText:
            (#
               result: ^Text;
            do
               splitPos -> subject.sub -> result[];
            exit result[]
            #);

          (* Get (as an integer pair) the position of the text after the previous
           * match (if any) but before the text that matched the regular
           * expression this time around.
           *)
          splitPos:
            (#
               start: @Integer;
               end: @Integer;
            ...
            exit (start, end)
            #);

          (* Gets called once for each split and once at the end.  You can call
           * splitText and splitPos from here to do something with the split
           * strings.  Gets called only once if the pattern doesn't match at all.
           *)
          split:<
            (#
               thismatch: @Integer;
            ...
            #);

          (* Make sure split and post get called at least once even if there
           * is no match at all.  You can add code here if you want to do
           * something whenever there is no match at all.
           *)
          noMatch::<
            (#
            ...
            #);

          (* Gets called once at the end.  You can call
           * splitText and splitPos from here to do something with the rest
           * You can also call spn, sp1, sp2, etc to get the first,
           * second etc. split text.  Restn, rest1, rest2 are similar, but they
           * get the rest of the string from the start of the nth split text to
           * the end.
           *)
          post:<
            (#
               spn:<
                 (#
                    num: @Integer;
                    result: ^Text;
                 enter num
                 ...
                 exit result[]
                 #);
               sp1: (# exit 1-> spn #);
               sp2: (# exit 2-> spn #);
               sp3: (# exit 3-> spn #);
               sp4: (# exit 4-> spn #);
               sp5: (# exit 5-> spn #);
               sp6: (# exit 6-> spn #);
               sp7: (# exit 7-> spn #);
               sp8: (# exit 8-> spn #);
               sp9: (# exit 9-> spn #);
               restn:<
                 (#
                    num: @Integer;
                    result: ^Text;
                 enter num
                 ...
                 exit result[]
                 #);
               rest1: (# exit 1-> restn #);
               rest2: (# exit 2-> restn #);
               rest3: (# exit 3-> restn #);
               rest4: (# exit 4-> restn #);
               rest5: (# exit 5-> restn #);
               rest6: (# exit 6-> restn #);
               rest7: (# exit 7-> restn #);
               rest8: (# exit 8-> restn #);
               rest9: (# exit 9-> restn #);
               ways2: (# exit (sp1, rest2) #);
               ways3: (# exit (sp1, sp2, rest3) #);
               ways4: (# exit (sp1, sp2, sp3, rest4) #);
               ways5: (# exit (sp1, sp2, sp3, sp4, rest5) #);
               ways6: (# exit (sp1, sp2, sp3, sp4, sp5, rest6) #);
               ways7: (# exit (sp1, sp2, sp3, sp4, sp5, sp6, rest7) #);
               ways8: (# exit (sp1, sp2, sp3, sp4, sp5, sp6, sp7, rest8) #);
               ways9: (# exit (sp1, sp2, sp3, sp4, sp5, sp6, sp7, sp8, rest9) #);
            do INNER;
            #);

       ...
       #);

     (*
      * Replace: match
      * Enter a reference to a text and a replacement string.  Exits a success
      * boolean and a text reference to the new string.  If there is no match
      * then false, plus a reference to a copy of the original string is exited.
      *)
     replace: match
       (#
          (* By overriding this you can put a different value in replacement,
           * so that the replacement text can be calculated dynamically (based
           * on eg. the contents or position of the matched or submatched texts).
           * (You can call matchText to get the text that matched)
           *)
          rep:< textObject;
          (* The replacement text entered by the caller (after the subject
           * entered through match).
           *)
          replacement: ^Text;
          (* The resulting text; stays NONE until something assigns it *)
          new: ^Text;
          (* Exits new[].  If new is still NONE at that point a copy of the
           * subject is made first, so the caller always gets a text
           * reference.
           *)
          check:
            (# 
            do (if new[] = NONE then subject.copy -> new[] if);
            exit new[]
            #);
       enter replacement[]
       ...
       (* Added to the exit list of match, giving (matched, new[]) *)
       exit check
       #);

     (*
      * ReplaceAll: matchAll
      * Enter a reference to a text and a replacement string.  Exits a success
      * boolean and a text reference to the new string.  If there is no match
      * then false, plus a reference to a copy of the original string is exited. 
      *)
     replaceAll: matchAll
       (#
          (* By overriding this you can put a different value in replacement,
           * so that the replacement text can be calculated dynamically (based
           * on eg. the contents or position of the matched or submatched texts).
           * (You can call matchText to get the text that matched)
           *)
          rep:< textObject;

          (* Called once at the end (see post in matchAll): appends the
           * split text that is current at that point to the result,
           * creating an empty result first if no match has built one yet.
           *)
          post::<
            (#
            do (if new[]=NONE then &text[]->new[] if);            
               splitText -> new.append;
               INNER;
            #);

          (* The replacement text entered by the caller (after the subject
           * entered through match).
           *)
          replacement: ^Text;
          (* The text being built up; NONE until the first match or post *)
          new: ^Text;
       enter replacement[]
       (* Executed for each match (matchAll calls INNER once per match) *)
       do
          (if new[] = NONE then
              (* First match: the split text before it becomes the result *)
              splitText -> new[];
              (* NOTE(review): presumably makes room for the rest of the
               * subject in advance - confirm against Text.extend.
               *)
              subject.lgth -> new.extend;
           else
              (* Later matches: add the text between the previous match and
               * this one.
               *)
              splitText -> new.append;
          if);
          (* Pass the replacement through rep so that a specialisation can
           * compute it, then add it in place of the matched text.
           *)
          replacement[] -> rep -> new.append;
          INNER;
       exit
          (#
          do
             (* Fall back to a copy of the subject if nothing was built *)
             (if new[] = NONE then subject.copy -> new[] if);
          exit new[]
          #)
       #);

     (* Pcre itself enters init (which takes a text reference and compiles it
      * to a regular expression) and exits a reference to itself, which lets you
      * dynamically create a regexp and call a method on it in one line
      *)
  enter init
  exit this(Pcre)[]
  #)


13.15 Pcre Interface
© 1990-2004 Mjølner Informatics
[Modified: Monday October 20th 2003 at 16:10]