13.15 Pcre Interface

ORIGIN 'betaenv';
INCLUDE 'private/pcrelib';
BODY 'private/pcrebody';
(*
 * COPYRIGHT
 *       Copyright (C) Mjolner Informatics, 2000
 *       All rights reserved.
 *       Written by Erik Corry
 *)
--- lib: attributes ---

  (*
   * Perl compatible regular expressions, based on Philip Hazel's PCRE
   * library.   See his documentation and perl documentation for details.
   * To activate the /i /x /m or /s options you can use the inline
   * notation (?x), either at the top level of the regular expression or
   * in a subexpression.  You can disable the options again with (?-x).  You
   * can also use the option patterns described in the comments below.
   * 
   * See also pcreDemo.bet in the basiclib/demo/pcre directory for some uses
   * for this stuff.
   *)

   (* HOW TO DO SOME TYPICAL PERL THINGS
    *
    * Here are a few things that are very easy to do in perl with the
    * equivalent using BETA's perl-compatible regular expression support.
    * As you can see, the BETA version is often a little longer - this is
    * the penalty you pay for having a general purpose language.  You can
    * save some space at the expense of readability and perhaps efficiency
    * by initialising the Pcre object inline.
    *
    * Assume
    * pre: @Pcre;
    * ok: @boolean;
    *
    * Desc:  Test whether a string matches a pattern
    * Perl:  if $sample =~ /trigger/ ...
    * BETA:  'trigger' -> pre;
    *        (if sample[] -> pre.match then ... if)
    * Alternative:
    *        (if sample[] -> ('trigger' -> Pcre).match then ... if)
    *
    * Desc:  Replace a text in a string with another text
    * Perl:  $sample =~ s/gun/pistol/;
    * BETA:  'gun' -> pre;
    *        (sample[], 'pistol') -> pre.replace -> (ok, sample[]);
    *
    * For /g use replaceAll instead of replace
    * For /e use rep, see HTMLise in pcreDemo in ~beta/basiclib/demo/pcre
    *
    * Desc:  Test for case insensitive match
    * Perl:  if $sample =~ /trigger/i ...
    * BETA:  '(?i)trigger' -> pre;
    *        (if sample[] -> pre.match then ... if);
    * Alternative:
    *        'trigger' -> pre (# options:: (# do CASELESS #) #);
    *        (if sample[] -> pre.match then ... if);
    * Likewise for /x
    *
    * Desc:  Split an input line three ways into fields using : as separator
    * Perl:  ($wordone, $wordtwo, $rest) = split(/:/, $sample, 3);
    * BETA:  sample[] -> (':' -> Pcre).matchAll
    *        (#
    *           post:: (# do sp1 -> wordone[];
    *                        sp2 -> wordtwo[];
    *                        rest3 -> rest[];
    *                  #)
    *        #)
    * Alternative:
    *        sample[] -> (':' -> Pcre).matchAll
    *        (# post:: (# do ways3 -> (wordone[], wordtwo[], rest[]) #) #);
    *)



Pcre:
(#
  (* Exception pattern that carries a text describing the problem.
   * Enter: errortext - reference to the descriptive text.
   * NOTE(review): where this exception is raised is not visible in this
   * interface file; presumably init raises it when the expression does
   * not compile - confirm against the body fragment.
   *)
  compilation_error:< Exception
  (#
     errortext: ^Text;
  enter errortext[]
  do INNER;
  #);

  (* Options:  See pcre.h and doc.
   * Each pattern exits one bit flag; flags are combined with %Bor.
   *)
  pcre_CASELESS:       (# exit   1 #);
  pcre_MULTILINE:      (# exit   2 #);
  pcre_DOTALL:         (# exit   4 #);
  pcre_EXTENDED:       (# exit   8 #);
  pcre_ANCHORED:       (# exit  16 #);
  pcre_DOLLAR_ENDONLY: (# exit  32 #);
  pcre_EXTRA:          (# exit  64 #);
  pcre_NOTBOL:         (# exit 128 #);
  pcre_NOTEOL:         (# exit 256 #);
  pcre_UNGREEDY:       (# exit 512 #);
  pcre_NOTEMPTY:       (# exit 1024 #);

  (* Mask covering the low 16 bits, i.e. every option bit below the
   * BETA-only options that follow.
   *)
  pcre_NONBETAOPTIONS: (# exit 65535 #);

  (* Only in BETA library version *)
  (* Use non-localised English char classes *)
  (* You have to set this when compiling the regexp, not when matching *)
  pcre_C_LOCALE:       (# exit 65536 #);
  (* Study the regular expression after compiling it *)
  (* You have to set this when compiling the regexp, not when matching *)
  pcre_DO_STUDY:       (# exit 131072 #);
  (* Give none instead of zero length strings for cases where there is no
   * match.  This is more correct, but you have to program more carefully
   * to avoid runtime errors.
   *)
  pcre_RETURN_NONE:        (# exit 262144 #);

  (* Mask of the options that are meaningful at match time.  Used by
   * match.options to filter the options given to this Pcre object.
   *)
  pcre_MATCHOPTIONS:       (# exit pcre_NOTBOL %Bor
                                   pcre_NOTEOL %Bor
                                   pcre_NOTEMPTY %Bor
                                   pcre_RETURN_NONE #);

  (* For internal use *)
  pcre_INFO_OPTIONS:      (# exit  0 #);
  pcre_INFO_SIZE:         (# exit  1 #);
  pcre_INFO_CAPTURECOUNT: (# exit  2 #);
  pcre_INFO_BACKREFMAX:   (# exit  3 #);
  pcre_INFO_FIRSTCHAR:    (# exit  4 #);
  pcre_INFO_FIRSTTABLE:   (# exit  5 #);
  pcre_INFO_LASTLITERAL:  (# exit  6 #);

  (* Error codes; see pcre.h and doc *)
  pcre_ERROR_NOMATCH:     (# exit  -1 #);
  pcre_ERROR_NULL:        (# exit  -2 #);
  pcre_ERROR_BADOPTION:   (# exit  -3 #);
  pcre_ERROR_BADMAGIC:    (# exit  -4 #);
  pcre_ERROR_UNKNOWN_NODE:(# exit  -5 #);
  pcre_ERROR_NOMEMORY:    (# exit  -6 #);
  pcre_ERROR_NOSUBSTRING: (# exit  -7 #);

  (* Private internal state *)
  private: @...;

  (* Read-only for users of pcre.  Tells you how many subpatterns your
   * pattern has.  Only useful if you are reading regular expressions from
   * a config file or from the user, since otherwise you should know this
   * figure already :-]
   *)
  subPatterns: @Integer;

  (* Specialise this in order to give options when compiling the
   * regular expression and default options when matching.
   *
   * value starts at 0; each nested pattern sets (or, for the clearXXX
   * variants, clears) one option bit in value.  The literal bit values
   * are the same as those exited by the pcre_XXX patterns above.
   * Exit: the resulting option word.
   *)
  options:<(#
     (* Options:  See above *)
     CASELESS:       (# do value %Bor 1 -> value #);
     MULTILINE:      (# do value %Bor 2 -> value #);
     DOTALL:         (# do value %Bor 4 -> value #);
     EXTENDED:       (# do value %Bor 8 -> value #);
     ANCHORED:       (# do value %Bor 16 -> value #);
     DOLLAR_ENDONLY: (# do value %Bor 32 -> value #);
     EXTRA:          (# do value %Bor 64 -> value #);
     NOTBOL:         (# do value %Bor 128 -> value #);
     NOTEOL:         (# do value %Bor 256 -> value #);
     UNGREEDY:       (# do value %Bor 512 -> value #);
     NOTEMPTY:       (# do value %Bor 1024 -> value #);
     C_LOCALE:       (# do value %Bor 65536 -> value #);
     DO_STUDY:       (# do value %Bor 131072 -> value #);
     RETURN_NONE:    (# do value %Bor 262144 -> value #);
     clearCASELESS:       (# do value %Band (%Bnot 1) -> value #);
     clearMULTILINE:      (# do value %Band (%Bnot 2) -> value #);
     clearDOTALL:         (# do value %Band (%Bnot 4) -> value #);
     clearEXTENDED:       (# do value %Band (%Bnot 8) -> value #);
     clearANCHORED:       (# do value %Band (%Bnot 16) -> value #);
     clearDOLLAR_ENDONLY: (# do value %Band (%Bnot 32) -> value #);
     clearEXTRA:          (# do value %Band (%Bnot 64) -> value #);
     clearNOTBOL:         (# do value %Band (%Bnot 128) -> value #);
     clearNOTEOL:         (# do value %Band (%Bnot 256) -> value #);
     clearUNGREEDY:       (# do value %Band (%Bnot 512) -> value #);
     clearNOTEMPTY:       (# do value %Band (%Bnot 1024) -> value #);
     clearC_LOCALE:       (# do value %Band (%Bnot 65536) -> value #);
     clearDO_STUDY:       (# do value %Band (%Bnot 131072) -> value #);
     clearRETURN_NONE:    (# do value %Band (%Bnot 262144) -> value #);

     value: @Integer;
  do 0 -> value;
     INNER;
  exit value
  #);

  (* init
   * Enter a text reference holding the regular expression.  Pcre itself
   * enters init (see the end of this pattern), so this is what runs when
   * you do 'expr' -> pre.  The implementation is in the body fragment.
   *)
  init:
  (#
     error: ^CString;
     exp: ^Text;
     opt: @Integer;
     errtext: @Integer;
     erroffset: @Integer;
  enter exp[]
  ...
  #);

  (* match
   * Enter a text reference into the regular expression.  Returns true or
   * false according to whether the text matched the expression.  Executes
   * INNER if there is a match.
   *)
  match:
  (#
     result: @Integer;
     subMatchCounter: @Integer;

     (* Advance the submatch counter by one and exit its new value.
      * Used by nextSubMatchPos and nextSubMatchText to step through the
      * submatches in order.
      *)
     nextSubMatchIndex:
     (#
     (* Assignment in BETA is written with ->.  The = operator is the
      * equality relation and would leave the counter unchanged.
      *)
     do subMatchCounter + 1 -> subMatchCounter;
     exit subMatchCounter
     #);

     (* Get (as an integer pair) the position of the text that matched
      * the regular expression in the original text.
      *)
     matchPos:
     (#
        start: @Integer;
        end: @Integer;
     ...
     exit (start, end)
     #);

     (* Get (as a text reference) the text that matched the regular
      * expression.
      *)
     matchText:
     (#
        result: ^Text;
     do
        matchPos -> subject.sub -> result[];
     exit result[]
     #);

     (* Get (as a text reference) the text before the text that matched
      * the regular expression.
      *)
     preMatchText:
     (#
        result: ^Text;
     ...
     exit result[]
     #);

     (* Get (as a text reference) the text after the text that matched
      * the regular expression.
      *)
     postMatchText:
     (#
        result: ^Text;
     ...
     exit result[]
     #);

     (* Get (as an integer pair) the position of the nth submatch in the
      * original text.  You get (0,0) if the nth subpattern didn't match.
      * (It is possible that the nth subpattern didn't match, even if
      * the pattern as a whole matched.  This is different from the
      * subpattern matching an empty string.)
      *)
     subMatchPos:
     (#
        index: @Integer;
        start: @Integer;
        end: @Integer;
     enter index
     ...
     exit (start, end)
     #);

     (* Get (as an integer pair) the position of the next submatch in the
      * original text.  You get (0,0) if the next subpattern didn't match.
      * (It is possible that the next subpattern didn't match, even if
      * the pattern as a whole matched.  This is different from the
      * subpattern matching an empty string.)
      *)
     nextSubMatchPos:
     (#
     exit nextSubMatchIndex -> subMatchPos
     #);

     (* Get (as a text reference) the text of the nth submatch in the
      * original text.  You get NONE if the nth subpattern didn't match and
      * you set the RETURN_NONE option.
      * (It is possible that the nth subpattern didn't match, even if
      * the pattern as a whole matched.  This is different from the
      * subpattern matching an empty string.)
      *)
     subMatchText:
     (#
        index: @Integer;
        start: @Integer;
        end: @Integer;
        result: ^Text;
     enter index
     ...
     exit result[]
     #);

     (* Get (as a text reference) the text of the next submatch in the
      * original text.  You get NONE if the next subpattern didn't match
      * and you set the RETURN_NONE option.
      * (It is possible that the next subpattern didn't match, even if
      * the pattern as a whole matched.  This is different from the
      * subpattern matching an empty string.)
      *)
     nextSubMatchText:
     (#
     exit nextSubMatchIndex -> subMatchText
     #);

     (*
      * Shorthand methods to get a given matched subpattern.
      * You get NONE if the given subpattern didn't match and you set the
      * RETURN_NONE option.
      * (It is possible that the subpattern didn't match, even if
      * the pattern as a whole matched.  This is different from the
      * subpattern matching an empty string.)
      *)
     sub1: (# exit 1 -> subMatchText #);
     sub2: (# exit 2 -> subMatchText #);
     sub3: (# exit 3 -> subMatchText #);
     sub4: (# exit 4 -> subMatchText #);
     sub5: (# exit 5 -> subMatchText #);
     sub6: (# exit 6 -> subMatchText #);
     sub7: (# exit 7 -> subMatchText #);
     sub8: (# exit 8 -> subMatchText #);
     sub9: (# exit 9 -> subMatchText #);

     (* Gets called if there is no match at all.  I'm sure you can think
      * of something useful to put here.
      *)
     noMatch:<
     (# do INNER; #);

     (* Specialise this in order to start at a position other than the
      * start of the string.  The default exit value is 1.
      *)
     position:<
     (#
        value: @Integer;
     do 1 -> value;
        INNER;
     exit value
     #);

     (* Specialise this in order to give options when executing the
      * regular expression.  Doesn't work for options used to compile
      * the regular expression, you had to give them earlier.  If you
      * don't specialise this then you get the global options for this
      * pcre object (restricted to the bits in pcre_MATCHOPTIONS).
      *)
     options:<
     (#
        (* Options:  See above
         * Only the options that are useful at match-time (as opposed to
         * init-time) are here
         *)
        NOTBOL:         (# do value %Bor 128 -> value #);
        NOTEOL:         (# do value %Bor 256 -> value #);
        NOTEMPTY:       (# do value %Bor 1024 -> value #);
        RETURN_NONE:    (# do value %Bor 262144 -> value #);
        clearNOTBOL:    (# do value %Band (%Bnot 128) -> value #);
        clearNOTEOL:    (# do value %Band (%Bnot 256) -> value #);
        clearNOTEMPTY:  (# do value %Band (%Bnot 1024) -> value #);
        clearRETURN_NONE:(# do value %Band (%Bnot 262144) -> value #);

        value: @Integer;
     do THIS(pcre).options %Band pcre_MATCHOPTIONS -> value;
        INNER;
     exit value
     #);

     (* Called before the first match is attempted
      *)
     pre:<
     (# do INNER; #);

     (* subject is the entered text; matched is the exited result. *)
     subject: ^Text;
     matched: @Boolean;
     opt: @Integer;
     psn: @Integer;
  enter subject[]
  ...
  exit (matched)
  #);

  (*
   * matchAll: match
   * Keeps matching as many times as possible until there are no more matches
   * or the end of the string is reached.  Returns true if at least one match
   * occurs.  Calls INNER for each match.
   *)
  matchAll: match
  (#
     privatema: @...;

     (* The number of matches we have so far.  This can be queried in split
      * (where it is always one less than the number of matches except the
      * last time) or in the INNER part of matchAll, where it is accurate.
      *)
     matches: @Integer;

     pre::<
     (#
     ...
     #);

     (* Get (as a text reference) the text after the previous match (if any)
      * but before the text that matched the regular expression this time
      * around.
      *)
     splitText:
     (#
        result: ^Text;
     do
        splitPos -> subject.sub -> result[];
     exit result[]
     #);

     (* Get (as an integer pair) the position of the text after the previous
      * match (if any) but before the text that matched the regular
      * expression this time around.
      *)
     splitPos:
     (#
        start: @Integer;
        end: @Integer;
     ...
     exit (start, end)
     #);

     (* Gets called once for each split and once at the end.  You can call
      * splitText and splitPos from here to do something with the split
      * strings.  Gets called only once if the pattern doesn't match at all.
      *)
     split:<
     (#
        thismatch: @Integer;
     ...
     #);

     (* Make sure split and post get called at least once even if there
      * is no match at all.  You can add code here if you want to do
      * something whenever there is no match at all.
      *)
     noMatch::<
     (#
     ...
     #);

     (* Gets called once at the end.  You can call
      * splitText and splitPos from here to do something with the rest.
      * You can also call spn, sp1, sp2, etc to get the first,
      * second etc. split text.  Restn, rest1, rest2 are similar, but they
      * get the rest of the string from the start of the nth split text to
      * the end.  waysN exits the first N-1 split texts followed by restN,
      * i.e. the subject split N ways.
      *)
     post:<
     (#
        spn:<
        (#
           num: @Integer;
           result: ^Text;
        enter num
        ...
        exit result[]
        #);
        sp1: (# exit 1-> spn #);
        sp2: (# exit 2-> spn #);
        sp3: (# exit 3-> spn #);
        sp4: (# exit 4-> spn #);
        sp5: (# exit 5-> spn #);
        sp6: (# exit 6-> spn #);
        sp7: (# exit 7-> spn #);
        sp8: (# exit 8-> spn #);
        sp9: (# exit 9-> spn #);
        restn:<
        (#
           num: @Integer;
           result: ^Text;
        enter num
        ...
        exit result[]
        #);
        rest1: (# exit 1-> restn #);
        rest2: (# exit 2-> restn #);
        rest3: (# exit 3-> restn #);
        rest4: (# exit 4-> restn #);
        rest5: (# exit 5-> restn #);
        rest6: (# exit 6-> restn #);
        rest7: (# exit 7-> restn #);
        rest8: (# exit 8-> restn #);
        rest9: (# exit 9-> restn #);
        ways2: (# exit (sp1, rest2) #);
        ways3: (# exit (sp1, sp2, rest3) #);
        ways4: (# exit (sp1, sp2, sp3, rest4) #);
        ways5: (# exit (sp1, sp2, sp3, sp4, rest5) #);
        ways6: (# exit (sp1, sp2, sp3, sp4, sp5, rest6) #);
        ways7: (# exit (sp1, sp2, sp3, sp4, sp5, sp6, rest7) #);
        ways8: (# exit (sp1, sp2, sp3, sp4, sp5, sp6, sp7, rest8) #);
        ways9: (# exit (sp1, sp2, sp3, sp4, sp5, sp6, sp7, sp8, rest9) #);
     do INNER;
     #);

  ...
  #);

  (*
   * Replace: match
   * Enter a reference to a text and a replacement string.  Exits a success
   * boolean and a text reference to the new string.  If there is no match
   * then false, plus a reference to a copy of the original string is exited.
   *)
  replace: match
  (#
     (* By overriding this you can put a different value in replacement,
      * so that the replacement text can be calculated dynamically (based
      * on eg. the contents or position of the matched or submatched texts).
      * (You can call matchText to get the text that matched)
      *)
     rep:<
     (#
        value: ^Text;
     enter value[]
     do INNER;
     exit value[]
     #);
     replacement: ^Text;
     new: ^Text;
  enter replacement[]
  ...
  exit
     (#
     do
        (* No result text was built: fall back to a copy of the subject *)
        (if new[] = NONE then subject.copy -> new[] if);
     exit new[]
     #)
  #);

  (*
   * ReplaceAll: matchAll
   * Enter a reference to a text and a replacement string.  Exits a success
   * boolean and a text reference to the new string.  If there is no match
   * then false, plus a reference to a copy of the original string is exited.
   *)
  replaceAll: matchAll
  (#
     (* By overriding this you can put a different value in replacement,
      * so that the replacement text can be calculated dynamically (based
      * on eg. the contents or position of the matched or submatched texts).
      * (You can call matchText to get the text that matched)
      *)
     rep:<
     (#
        value: ^Text;
     enter value[]
     do INNER;
     exit value[]
     #);

     (* Append the text following the last match to the result.
      * post is documented (see matchAll.noMatch) to be called even when
      * nothing matched; in that case no result text has been started, so
      * new[] is still NONE and must not be appended to.  The exit part
      * below then supplies a copy of the subject instead.
      *)
     post::<
     (#
     do (if new[] <> NONE then splitText -> new.append if);
        INNER;
     #);

     replacement: ^Text;
     new: ^Text;
  enter replacement[]
  do
     (* Executed once per match (this is the INNER of matchAll) *)
     (if new[] = NONE then
        (* First match: start the result with the text before the match *)
        splitText -> new[];
        subject.lgth -> new.extend;
     else
        splitText -> new.append;
     if);
     replacement[] -> rep -> new.append;
     INNER;
  exit
     (#
     do
        (* No result text was built: fall back to a copy of the subject *)
        (if new[] = NONE then subject.copy -> new[] if);
     exit new[]
     #)
  #);

(* Pcre itself enters init (which takes a text reference and compiles it
 * to a regular expression) and exits a reference to itself, which lets you
 * dynamically create a regexp and call a method on it in one line
 *)
enter init
exit this(Pcre)[]
#)


13.15 Pcre Interface
© 1990-2002 Mjølner Informatics
[Modified: Wednesday January 10th 2001 at 16:28]