-- regexp.sa
-- Generated by gen_html_sa_files from ICSI. Contact gomes@icsi.berkeley.edu for details.
---------------------------> Sather 1.1 source file <--------------------------
-- regexp.sa: Object oriented regular expression frontend.
-- Author: Holger Klawitter <holger@icsi.berkeley.edu>
-- Copyright (C) 1996, International Computer Science Institute
-- $Id: regexp.sa,v 1.1 1996/06/28 03:02:41 holger Exp $
-- COPYRIGHT NOTICE: This code is provided WITHOUT ANY WARRANTY
-- and is subject to the terms of the SATHER LIBRARY GENERAL PUBLIC
-- LICENSE contained in the file: Sather/Doc/License of the
-- Sather distribution. The license is also available from ICSI,
-- 1947 Center St., Suite 600, Berkeley CA 94704, USA.
------------ class REGEXP
class REGEXP --< $FINALIZE
   -- Sather frontend for regular expressions.
   -- As specified in POSIX draft 1003.2/D11.2.
   --
   -- An object holds the expression as a string in POSIX syntax ('c_str'),
   -- a case flag ('ignore_case') and, once 'compile' has run, an opaque
   -- handle to the compiled expression ('c_regexp').  The creation and
   -- combination routines only build new strings; the C library is first
   -- involved in 'compile' (called explicitly, by 'regexp', or lazily by
   -- 'match').
   --
   -- Typical use:  r ::= REGEXP::str("foo") + REGEXP::str("bar");
   --               ok ::= r.match("a bar");
   --
   -- NOTE(review): the '$FINALIZE' supertype is commented out on the class
   -- line, so 'finalize' is presumably not invoked automatically -- confirm.
is

   ------------ Creation of atomic regular expressions:

   digit: SAME is return new.init("[0-9]") end;
   -- Returns a regular expression matching '0', '1' .. or '9'.

   letter: SAME is return new.init("[a-zA-Z]") end;
   -- Returns a regular expression matching 'a' .. 'z' or 'A' .. 'Z'.

   uppercase: SAME is return new.init("[A-Z]") end;
   -- Returns a regular expression matching 'A', 'B' .. or 'Z'.

   lowercase: SAME is return new.init("[a-z]") end;
   -- Returns a regular expression matching 'a', 'b' .. or 'z'.

   whitespace: SAME is return new.init("[ \t\v\n]"); end;
   -- Returns a regular expression matching ' ', '\t', '\v' or '\n'.

   beginning: SAME is return new.init("^"); end;
   -- Returns a regular expression matching the beginning of the string.

   ending: SAME is return new.init("$"); end;
   -- Returns a regular expression matching the end of a string.

   char: SAME is return new.init("."); end;
   -- Returns a regular expression matching one arbitrary char.

   str(s:STR): SAME
      -- Returns a regular expression matching exactly the string 's'.
      -- Every character of 's' that has a special meaning in the
      -- expression syntax is escaped first (see 'escape').
   is
      return new.init(escape(s))
   end;

   oneof(s:STR): SAME
      -- Returns a regular expression matching one of the chars in 's'.
      -- A single (normalized) character is emitted as an escaped literal
      -- instead of a bracket expression.
   is
      s := normalize(s);
      if s.length=1 then return new.init(escape(s)); end;
      return new.init("["+s+"]");
   end;

   noneof(s:STR): SAME
      -- Returns a regular expression matching none of the chars in 's'.
   is
      return new.init("[^"+normalize(s)+"]");
   end;

   ------------ Combination of regular expressions

   pow( r:SAME ): SAME
      -- Returns a regular expression matching self and then r.
      -- Ignores the case when one of the subexpressions ignores the case.
      -- The two expression strings are concatenated as they are, without
      -- adding parentheses.
   is
      return new.init(c_str+r.c_str,ignore_case or r.ignore_case);
   end;

   plus( r:SAME ): SAME
      -- Returns a regular expression matching self or r.
      -- Ignores the case when one of the subexpressions ignores the case.
      -- The alternation is wrapped in parentheses so it stays one unit
      -- when combined further.
   is
      return new.init
	    ("("+c_str+"|"+r.c_str+')', ignore_case or r.ignore_case );
   end;

   nocase: SAME
      -- Returns a regular expression as self but ignoring the case.
   is
      return new.init(c_str,true);
   end;

   usecase: SAME
      -- Returns a regular expression as self but respecting the case.
   is
      return new.init(c_str,false);
   end;

   any_times: SAME
      -- Returns a regular expression matching self in zero or more
      -- occurrences.
   is
      return new.init("("+c_str+")*",ignore_case);
   end;

   atleast(i:INT): SAME
      -- Returns a regular expression matching self in at least 'i'
      -- occurrences.
      pre i>=0
   is
      return new.init("("+c_str+"){"+i+",}",ignore_case);
   end;

   atmost(i:INT): SAME
      -- Returns a regular expression matching self in zero up to at most
      -- 'i' occurrences.
      pre i>=1
   is
      return new.init("("+c_str+"){0,"+i+"}",ignore_case);
   end;

   between(i,j:INT): SAME
      -- Returns a regular expression matching self in between 'i' and 'j'
      -- occurrences.
      pre j>=i and i>=0
   is
      return new.init("("+c_str+"){"+i+","+j+"}",ignore_case);
   end;

   regexp(s:STR,ignorecase:BOOL): SAME
      -- Passing a regular expression directly from string 's' using the
      -- POSIX syntax. (Special meanings are carried by unescaped characters.)
      -- The expression is compiled right away; returns void if the
      -- expression is illegal (i.e. compilation yields no handle).
   is
      res: SAME := new.init(s,ignorecase);
      res.compile;
      if void(res.c_regexp) then return void end;
      return res;
   end;

   ------------ Using regular expressions

   match( s:STR ): BOOL
      -- Returns true if the string 's' is accepted by the regular
      -- expression, i.e. if the three-argument 'match' reports a start
      -- position >= 0.
   is
      b,e: INT;
      match(s,out b,out e);
      return b>=0;
   end;

   match( s:STR, out from: INT, out to: INT )
      -- Returns in 'from' the first position of 's' matching the regular
      -- expression and in 'to' the position of the first character after
      -- the match.
      -- 'from' and 'to' are -1 when no match is found.
      -- Compiles the expression first if that has not happened yet; the
      -- assertion fails if the expression cannot be compiled.
   is
      cstr,outfrom,outto: EXT_OB;
      if void(c_regexp) then compile end;
      assert ~void(c_regexp);
      -- NOTE(review): unlike in 'compile', 's', 'outfrom', 'outto', 'from'
      -- and 'to' carry no '#' prefix in the inlined C below.  Left exactly
      -- as found; verify against the compiler's inlined_C rules.
      SYS::inlined_C("#cstr=(void*)s->arr_part;");
      SYS::inlined_C("outfrom=(void*)from;");
      SYS::inlined_C("outto=(void*)to;");
      C_REGEXP::C_REGEXP_match(c_regexp,cstr,outfrom,outto);
      -- This does the same:
      --creg := c_regexp;
      --SYS::inlined_C("C_REGEXP_match(#creg,#s->arr_part,from,to);");
   end;

   compile
      -- Preparation of a regular expression to be used in 'match'.
      -- If not called before, will be called automatically on the first call
      -- to 'match'.  Does nothing when a compiled handle already exists.
      -- 'c_regexp' stays void when the C side returns void (illegal
      -- expression, according to the C_REGEXP_compile description).
   is
      if ~void(c_regexp) then return end;
      s: STR := c_str;
      ic: BOOL := ignore_case;
      cstr,ccase: EXT_OB;
      -- Hand the character data and the case flag to C as untyped pointers.
      SYS::inlined_C("#cstr=(void*)#s->arr_part");
      SYS::inlined_C("#ccase=(void*)(int)#ic");
      c_regexp := C_REGEXP::C_REGEXP_compile(cstr,ccase);
   end;

   ------------ Internal routines

   readonly attr c_str: STR;
   -- Regular expression in POSIX syntax.

   readonly attr ignore_case: BOOL;
   -- True, if regular expression does not respect the case.

   private attr c_regexp: EXT_OB;
   -- Compiled version of the regular expression; void until 'compile'
   -- has succeeded.

   private finalize
      -- Releases the compiled expression, if any.
      -- The internal buffers of the POSIX regexp may keep some of the
      -- buffers reachable, meaning unreachable for GC.
      post void(c_regexp)
   is
      if void(c_regexp) then return; end;
      C_REGEXP::C_REGEXP_free(c_regexp);
      c_regexp := void;
   end;

   private init(s:STR): SAME is return init(s,false) end;
   -- Internal creation routine for a case-respecting expression.

   private init(s:STR,ignorecase:BOOL): SAME
      -- Internal creation routine.  Stores the expression string and the
      -- case flag and marks the object as not yet compiled.
   is
      c_regexp := void;
      ignore_case := ignorecase;
      c_str := s;
      return self;
   end;

   private normalize(s:STR): STR
      -- Returns a normalized version of 's'. Characters with special meaning
      -- in bracket expressions are rearranged to keep literal meaning:
      -- '^' is moved away from the front, '-' is moved to the very end and
      -- ']' is moved to the very front.
   is
      if s.contains('^') then s := s.remove('^')+'^'; end;
      if s.contains('-') then s := s.remove('-')+'-'; end;
      if s.contains(']') then s := "]"+s.remove(']') end;
      -- When the set consists of just '^' and '-', the steps above leave
      -- '^' in front, so "[" + s + "]" would read as a negated set.
      -- A leading '-' is literal as well, hence swap the two.
      if s = "^-" then s := "-^"; end;
      return s;
   end;

   private escape(s:STR): STR
      -- Returns a string in which all characters with special meaning
      -- are escaped to keep their literal meaning.
   is
      res: STR;
      loop
	 c: CHAR := s.elt!;
	 -- '?' and '|' are included: both are operators in the extended
	 -- syntax this class generates (see 'plus').
	 if "\\[]*.^$+?|(){}".contains(c) then
	    res := res + '\\' + c;
	 else
	    res := res + c;
	 end;
      end;
      return res;
   end;

end; -- class REGEXP
------------ external class C_REGEXP
external class C_REGEXP
-- Declarations of the C routines REGEXP relies on for compiling,
-- matching and freeing regular expressions.  All arguments and results
-- are passed as untyped EXT_OB values.
is
C_REGEXP_compile(string:EXT_OB,ignorecase:EXT_OB):EXT_OB;
-- Gets a string and a case flag and returns a compiled regular
-- expression, or void if the expression had an error.
C_REGEXP_match(regexp,string,beg,ending:EXT_OB);
-- Tries to match the string against the precompiled regular expression.
-- Returns the beginning and ending position of the match through
-- 'beg' and 'ending' (passed as (int*)), or (-1,-1) if no match is found.
C_REGEXP_free(regexp:EXT_OB);
-- Removes a precompiled regular expression from memory. Called from
-- finalize, as the regexp library might keep internal pointers otherwise.
end; -- external class C_REGEXP
------------ class TEST_REGEXP
class TEST_REGEXP
   -- Test driver for class REGEXP.  Builds a few expressions with the
   -- combinators and checks 'match' against expected results.
   -- 'class_name', 'test' and 'finish' are presumably supplied by the
   -- included TEST class.
is
   include TEST;

   main
      -- Runs all REGEXP checks.  Every result is converted with '.str'
      -- so that each 'test' call compares two strings.
   is
      r1,r2,r3: REGEXP;
      class_name("REGEXP");

      -- r1: at the beginning, either a char out of "^{}[]()" followed by
      -- the literal "{}[]()" and a char other than '^', '-', 'a' -- or
      -- "banana".
      r1 := REGEXP::beginning ^
	    (REGEXP::oneof("^{}[]()")
	     ^ REGEXP::str("{}[]()")
	     ^ REGEXP::noneof("^-a")
	     + REGEXP::str("banana")
	    );
      -- r2: exactly "foo" or exactly "bar".
      r2 := REGEXP::beginning
	    ^ (REGEXP::str("foo") + REGEXP::str("bar"))
	    ^ REGEXP::ending;
      -- r3: illegal expression, 'regexp' is expected to return void.
      r3 := REGEXP::regexp("((",false);

      test( "POSIX syntax selection", void(r2).str, "false" );
      test( "POSIX syntax selection", void(r3).str, "true" );
      test( "'^' handling", r1.match("^{}[]()b").str, "true" );
      test( "'[]' handling", r1.match("{{}[]()x").str, "true" );
      test( "'[^]' handling", r1.match("^{}[]()a").str, "false" );
      test( "'-' handling", r1.match("^{}[]()-").str, "false" );
      test( "'|' precedence", r1.match("banana").str, "true" );
      test( "beginning", r2.match("fobar").str, "false" );
      test( "ending", r2.match("fooar").str, "false" );
      test( "'|' handling", r2.match("foo").str, "true" );
      test( "'|' handling", r2.match("bar").str, "true" );

      -- Repetition counts and reported match positions.
      r3 := REGEXP::str("a")^REGEXP::str("b").atmost(3)^REGEXP::str("c");
      b,e: INT;
      r3.match("xabbcz",out b,out e);test("match", b.str+","+e.str, "1,5");
      r3.match("xabbz",out b,out e);test("match", b.str+","+e.str, "-1,-1");
      test( "atmost", r3.match("abbc").str, "true" );
      test( "atmost", r3.match("abbbc").str, "true" );
      test( "atmost", r3.match("abbbbc").str, "false" );
      r3 := REGEXP::str("a")^REGEXP::str("b").between(2,3)^REGEXP::str("c");
      test( "between", r3.match("abc").str, "false" );
      test( "between", r3.match("abbbc").str, "true" );
      test( "between", r3.match("abbbbc").str, "false" );
      r3 := REGEXP::str("a")^REGEXP::str("b").atleast(2)^REGEXP::str("c");
      test( "atleast", r3.match("abc").str, "false" );
      test( "atleast", r3.match("abbc").str, "true" );
      test( "atleast", r3.match("abbbc").str, "true" );

      -- Case handling.
      r3 := REGEXP::str("ananas");
      test( "right case", r3.match("ananas").str, "true" );
      test( "wrong case", r3.match("Ananas").str, "false" );
      test( "wrong case", r3.match("ananaS").str, "false" );
      r3 := r3.nocase;
      -- If the following 'ignorecase' and 'usecase' tests fail, the
      -- REG_ICASE in regexp.c probably has to be set for the current
      -- architecture.
      test( "ignorecase", r3.match("Ananas").str, "true" );
      test( "ignorecase", r3.match("ananaS").str, "true" );
      r3 := r3.usecase;
      test( "usecase", r3.match("Ananas").str, "false" );
      test( "usecase", r3.match("ananas").str, "true" );
      finish;
   end; -- main

end;