; Copyright 2009 Uwe Hollerbach ; $Id: regexp.scm,v 1.8 2009-06-20 03:10:27 uwe Exp $ ; BSD3... but if you use this for anything serious, you gotta be kidding ; grammar for regular expressions: precedence is (highest to lowest) ; counting operators: *+? TODO: add count {N} or range {LO,HI} ; -> convert current ops to ; ('count lo hi ...) kind of thing ; concatenation ; alternation RE1 | RE2 ; parentheses force grouping ; TODO: add negated ranges [^a-z], more-complex character ranges [a-fq-z] etc ; ; regexp = re1 ; | re1 '|' regexp ; ; re1 = re2 ; | re2 re1 ; ; re2 = re3 ; | re3 '?' ; | re3 '*' ; | re3 '+' ; ; re3 = character ; | escaped-character ; | character-range (only simple ranges [x-y]) ; | '(' regexp ')' ; ; NOTE!!! escaped characters are '\(' for example; but the scheme reader ; already processes '\', so if entering from keyboard or some literal ; string, need to double-escape it: (regexp-parse (lexer "(a|\\()*")) ; Parse a complete regular expression, checking that there is nothing left ; at the end. This is kind of a large function... but I don't really want to ; expose the individual levels, they don't make much sense by themselves. ; This returns an AST: a literal character or a list of lists describing ; sub-regexps. The car of each list is a description of the type of regexp: ; 'CONCAT, 'ALT, or 'COUNT. If the type is 'COUNT, then the cadr of the list ; descibes the count range: currently one of #\? #\* #\+. Remaining entries ; in all the lists are again ASTs. (define (regexp-parse str) (letrec* ((tokens (filter (lambda (c) (not (char-whitespace? c))) (string->char str))) (cur-tok #f) (peek (lambda () (if (null? tokens) #f (car tokens)))) (pop (lambda () (if (null? tokens) #f (begin (set! cur-tok (car tokens)) (set! tokens (cdr tokens)) cur-tok)))) ; handle choice operator '|' (p-re0 (lambda () (let* ((re1 (p-re1)) (cur (peek))) (if (eqv? cur #\|) (begin (pop) (list 'ALT re1 (p-re0))) re1)))) ; handle special kind of choice: character range; for now, only handle ; [a-b] type (except that the brackets are done outside these routines) (gen-range (lambda (cs) (if (null? (cdr cs)) (car cs) (list 'ALT (car cs) (gen-range (cdr cs)))))) (p-range (lambda () (let* ((c1 (char->integer (pop))) (cminus (pop)) (c2 (char->integer (pop))) (lo (min c1 c2)) (hi (max c1 c2)) (n (+ 1 (- hi lo)))) (if (eqv? cminus #\-) (gen-range (map integer->char (upfrom lo n))) (raise "bad character range"))))) ; handle concatenation "operator" (p-re1 (lambda () (let* ((re2 (p-re2)) (cur (peek))) (if (or (eqv? cur #f) (eqv? cur #\|) (eqv? cur #\))) re2 (list 'CONCAT re2 (p-re1)))))) ; handle count operators '?', '*', and '+' (p-re2 (lambda () (let* ((re3 (p-re3)) (cur (peek))) (if (or (eqv? cur #\?) (eqv? cur #\*) (eqv? cur #\+)) (list 'COUNT (pop) re3) re3)))) ; handle individual characters and parenthesized regexps (p-re3 (lambda () (let ((cur (peek))) (cond ((eqv? cur #\() (pop) (set! cur (p-re0)) (if (eqv? (peek) #\)) (begin (pop) cur) (raise "error: unbalanced parentheses"))) ((eqv? cur #\[) (pop) (set! cur (p-range)) (if (eqv? (peek) #\]) (begin (pop) cur) (raise "error: unbalanced brackets"))) ((eqv? cur #\\) (pop) (pop)) ((or (eqv? cur #\|) (eqv? cur #\?) (eqv? cur #\*) (eqv? cur #\+)) (raise "error: unexpected operator")) (else (pop)))))) (regexp (p-re0))) (if (eqv? (peek) #f) regexp (raise "error: input not completely used")))) ; Flatten the AST produced by regexp-parse: instead of a pure tree of ; binary ops, CONCAT and ALT get turned into multi-operand operators ; where applicable. For example ; (CONCAT #\a (CONCAT #\b (CONCAT #\c (CONCAT #\d #\e)))) ; gets turned into ; (CONCAT #\a #\b #\c #\d #\e) ; This will make the NFA have a lot fewer intermediate states. ; TODO: add more-efficient alternation for a range of characters: instead of ; building a huge number of states and gluing them all together, just do two ; states and allow transitions on multiple characters. Simple enough, should ; significantly reduce the number of states in the NFA (define (regexp-flatten-ast ast) (let ((s-op (lambda (op t1 t2) (let ((lst (list op)) (lifter (lambda (l) ((if (and (list? l) (eqv? (car l) op)) cdr list) l)))) (append (append lst (lifter t1)) (lifter t2)))))) (if (list? ast) (cond ((eqv? 'COUNT (car ast)) (list (car ast) (cadr ast) (regexp-flatten-ast (caddr ast)))) ((or (eqv? 'CONCAT (car ast)) (eqv? 'ALT (car ast))) (s-op (car ast) (regexp-flatten-ast (cadr ast)) (regexp-flatten-ast (caddr ast)))) (else ast)) ast))) ; This generates a list of states describing an NFA, given an AST generated ; from the above routine(s). The first state in the list will be the start ; state, and the second state in the list will be the end state; intermediate ; states will follow after that. The initial version produces copious ; intermediate states with epsilon-transitions. (define (regexp-make-nfa ast) (letrec* ((con3 (lambda (a b c) (cons a (cons b c)))) ; Make a new state (st-count 0) (mk-st (lambda () (set! st-count (+ 1 st-count)) (list st-count '()))) ; Add a transition from state ST1 to state ST2 on character C ; or an eps-transition if c is the empty list (add-tr (lambda (st1 st2 c) (cons (car st1) (append (if (null? c) (list (cons (car st2) (cadr st1))) (list (cadr st1) (list (car st2) c))) (cddr st1))))) ; Add an eps-transition from state ST to each of the states in the list STS (add-h-eps (lambda (st sts) (if (null? sts) st (add-h-eps (add-tr st (car sts) '()) (cdr sts))))) ; Add an eps-transition from the tail state of NFA to state ST (add-t-eps (lambda (nfa st) (con3 (car nfa) (add-tr (cadr nfa) st '()) (cddr nfa)))) ; Merge two NFAs into one which represents their concatenation (merge-nfas (lambda (nfa1 nfa2) (cond ((null? nfa1) nfa2) ((null? nfa2) nfa1) (else (let ((h2 (car nfa2))) (con3 (car nfa1) (cadr nfa2) (append (list (add-tr (cadr nfa1) h2 '()) h2) (cddr nfa1) (cddr nfa2)))))))) ; Main routine to walk the AST and translate into an NFA (gen-nfa (lambda (ast) (let ((st #f) (cop #f) (sub #f)) (cond ((char? ast) (set! st (mk-st)) (list (add-tr (mk-st) st ast) st)) ((eqv? 'COUNT (car ast)) (set! cop (cadr ast)) (set! sub (gen-nfa (caddr ast))) (cond ((eqv? #\? cop) (cons (add-tr (car sub) (cadr sub) '()) (cdr sub))) ((eqv? #\+ cop) (con3 (car sub) (add-tr (cadr sub) (car sub) '()) (cddr sub))) ((eqv? #\* cop) (con3 (add-tr (car sub) (cadr sub) '()) (add-tr (cadr sub) (car sub) '()) (cddr sub))) (else (raise "unknown COUNT op!")))) ((eqv? 'CONCAT (car ast)) (foldl merge-nfas '() (map gen-nfa (cdr ast)))) ((eqv? 'ALT (car ast)) (set! st (mk-st)) (set! sub (map gen-nfa (cdr ast))) (con3 (add-h-eps (mk-st) (map car sub)) st (apply append (map (lambda (a) (add-t-eps a st)) sub)))) (else (raise "unknown regexp op!"))))))) (gen-nfa ast))) ; This is a lot simpler using a vector than it would be with pure lists... ; it would be possible, but the vector gives us the equivalent of set-car! ; and that makes a huge difference. With bitsets, it's even not unreasonably ; slow. (define (regexp-gen-eps-closure nfa) (letrec* ((max-state (lambda (nfa m) (if (null? nfa) m (max-state (cdr nfa) (max m (caar nfa)))))) (n (+ 1 (max-state nfa 1))) (vec (make-vector n)) (cur #f) (new #f) (change #t)) (map (lambda (s) (vector-set! vec (car s) (foldl bitset-add (bitset-new) (cadr s)))) nfa) (while change (set! change #f) (do ((i 1 (+ i 1))) ((>= i n) #t) (set! cur (vector-ref vec i)) (set! new cur) (bitset-foreach cur (lambda (j) (set! new (bitset-or new (vector-ref vec j))))) (unless (bitset-equal? new cur) (set! change #t) (vector-set! vec i new)))) (map (lambda (s) (let ((ec (vector-ref vec (car s)))) (set! ec (bitset-add ec (car s))) (cons (car s) (cons ec (cddr s))))) nfa))) ; NFA walker ; This generates the epsilon-union of a state set SET from the NFA SS (define (find-eps-union set ss) (letrec* ((eu (bitset-new)) (find-eps-closure (lambda (s ss) (cond ((null? ss) (raise "programming error! state not in nfa!")) ((= s (caar ss)) (cadar ss)) (else (find-eps-closure s (cdr ss)))))) (merge-eps-closure (lambda (s) (set! eu (bitset-or eu (find-eps-closure s ss)))))) (bitset-foreach set merge-eps-closure) eu)) ; This makes a single transition from any one of the input states STS, ; given the character CH (define (make-transition nfa sts ch) (letrec ((find-transitions ; look up the real transitions (lambda (s ss) ; of the given state S in the NFA SS (cond ((null? ss) (raise "programming error! state not in nfa!")) ((= s (caar ss)) (cddar ss)) (else (find-transitions s (cdr ss)))))) ; find all the new states we could be in, given the transitions list TS ; and the current character CH (given in the surrounding scope) (process-transitions (lambda (acc ts) (cond ((null? ts) acc) ((memv ch (cdar ts)) (process-transitions (bitset-add acc (caar ts)) (cdr ts))) (else (process-transitions acc (cdr ts)))))) ; find the union of all the epsilon-closures of the input states (eps-union (find-eps-union sts nfa)) (new-sts (bitset-new))) ; make all possible transitions to new states from this eps-union (bitset-foreach eps-union (lambda (s) (set! new-sts (bitset-or new-sts (process-transitions (bitset-new) (find-transitions s nfa)))))) new-sts)) ; Make a series of transitions ; TODO: for things like numbers in scientific notation, where there's ; an optional trailing part, this doesn't work so well: if we enter the ; string "-0.475e" which is a malformed number in scientific notation, ; but it has a well-formed prefix "-0.475", we'd like to return that as ; a match ---> need to keep track of whether intermediate states are ; matching, and return the last matching intermediate state etc. (define (make-transitions nfa init-state match-states str) (letrec* ((chars (string->char str)) (doit (lambda (sts chs acc) (if (null? chs) (list (reverse acc) sts) (let ((new-st (make-transition nfa sts (car chs)))) (if (bitset-empty? new-st) (list (reverse acc) sts) (doit new-st (cdr chs) (cons (car chs) acc))))))) (gen-or (lambda (acc ss) (if (null? ss) acc (gen-or (bitset-add acc (car ss)) (cdr ss))))) (result (doit (bitset-add (bitset-new) init-state) chars '())) (final-state (find-eps-union (cadr result) nfa)) (mstr (char->string (car result))) (matches #f)) (if (null? match-states) result (begin (set! matches (bitset-and (gen-or (bitset-new) match-states) final-state)) (write-string "matches is ") (show-bits matches) (newline) (if (bitset-empty? matches) (list #f matches mstr) (list #t matches mstr)))))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Some simple regexps to try out: note that all the escaped characters ; are double-escaped: to get a literal '+' into the regexp parser, we ; need to get it (a) past the REPL's escape mechanism, then (b) escape ; it for the regexp parser. Hence all the '\\+' ; unsigned and signed integers: very simple (define unsigned-integer "[0-9]+") (define signed-integer "(\\+|-)?[0-9]+") ; an unsigned decimal number: xxx.yyy, where both xxx and yyy can ; represent any number of digits including none, except that they ; can't both be empty simultaneously (define unsigned-decimal "(([0-9]+.[0-9]*)|([0-9]*.[0-9]+))") ; same as above, with an optional sign out front (define signed-decimal (string-join-by "" "(\\+|-)?" unsigned-decimal)) ; same as above, with an optional scientific-notation trailer (define signed-scientific (string-join-by "" signed-decimal "((e|E)(\\+|-)?[0-9]+)?")) ; same in base-2, for simplicity (define unsigned-integer2 "[0-1]+") (define signed-integer2 "(\\+|-)?[0-1]+") (define unsigned-decimal2 "(([0-1]+.[0-1]*)|([0-1]*.[0-1]+))") (define signed-decimal2 (string-join-by "" "(\\+|-)?" unsigned-decimal2)) (define signed-scientific2 (string-join-by "" signed-decimal2 "((e|E)(\\+|-)?[0-1]+)?")) ; These two are for conveniently building "killer" regexps which will ; cause backtracking NFA implementations (such as in perl, python, and ; many other scripting languages) to run exponentially slowly. We do ; better than that... although the constant out front is kinda bad :-) (define (make-slow-string n) (string-join-by "" (replicate "a" n))) (define (make-slow-regexp n) (string-join-by "" (append (replicate "a?" n) (replicate "a" n)))) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; debugging stuff (define (show-bits cur) (let ((start #\<)) (if (bitset-empty? cur) (write-string start) (bitset-foreach cur (lambda (j) (write-string start) (set! start #\space) (write-string (number->string j))))) (write-string ">"))) (define (tab) (write-string #\tab)) (define (space) (write-string #\space)) (define (show-state s) (write-string "state = ") (display (car s)) (unless (null? (cadr s)) (write-string "\teps ") (show-bits (cadr s))) (unless (null? (cddr s)) (write-string " regular ") (map (lambda (x) (display x) (space)) (cddr s))) (newline)) (define (show-states s) (write-string "states = \n") (map (lambda (st) (show-state st)) s) #t)