00001
00002
00003
00004
00005
00006
00007
00008
00009
00010 package require struct::list ;
00011 package require struct:: ; # Extended = operations = .
00012
00013
00014
00015
00016 namespace ::grammar::fa::op {
00017
00018
00019
00020
00021 ret reverse (type fa) {}
00022 ret complete (type fa , optional sink ={)} {}
00023 proc remove_eps {fa} {}
00024 proc trim {fa {what !reachable|!useful}} {}
00025 ret determinize (type fa , optional mapvar ={) {idstart 0}} {}
00026 ret minimize (type fa , optional mapvar ={)} {}
00027
00028 proc complement {fa} {}
00029 proc kleene {fa} {}
00030 proc optional {fa} {}
00031 proc union {fa fb {mapvar {}}} {}
00032 ret intersect (type fa , type fb , optional mapvar ={) {idstart 0}} {}
00033 ret difference (type fa , type fb , optional mapvar ={)} {}
00034 proc concatenate {fa fb {mapvar {}}} {}
00035
00036 ret fromRegex (type fa , type regex , optional over ={)} {}
00037
00038 proc toRegexp {fa} {}
00039 proc toRegexp2 {fa} {}
00040
00041 proc simplifyRegexp {rex} {}
00042 proc toTclRegexp {rex symdict} {}
00043
00044 # ### ### ### ######### ######### #########
00045
00046 namespace export reverse complete remove_eps trim \
00047 determinize minimize complement kleene \
00048 optional union intersect difference \
00049 concatenate fromRegex toRegexp toRegexp2 \
00050 simplifyRegexp toTclRegexp
00051
00052 # ### ### ### ######### ######### #########
00053 ## Internal data structures.
00054
00055 variable cons {}
00056
00057 # ### ### ### ######### ######### #########
00058 }
00059
00060 # ### ### ### ######### ######### #########
00061 ## API implementation. Structure
00062
00063 proc ::grammar::fa::op::reverse {fa} {
00064 # Reversal means that all transitions change their direction
00065 # and start and final states are swapped.
00066
00067 # Note that reversed FA might not be deterministic, even if the FA
00068 # itself was.
00069
00070 # One loop is not enough for this. If we reverse the
00071 # transitions for a state immediately we may modify a state
00072 # which has not been processed yet. And when we come to this
00073 # state we reverse already reversed transitions, creating a
00074 # complete mess. Thus two loops, one to collect the current
00075 # transitions (and also remove them), and a second to insert
00076 # the reversed transitions.
00077
00078 set tmp [$fa finalstates]
00079 $fa final set [$fa startstates]
00080 $fa start set $tmp
00081
00082 # FUTURE : Method to retrieve all transitions
00083 # FUTURE : Method to delete all transitions
00084
00085 set trans {}
00086 foreach s [$fa states] {
00087 foreach sym [$fa symbols@ $s] {
00088 lappend trans $s $sym [$fa next $s $sym]
00089 $fa !next $s $sym
00090 }
00091 }
00092 foreach {s sym destinations} $trans {
00093 foreach d $destinations {
00094 $fa next $d $sym --> $s
00095 }
00096 }
00097 return
00098 }
00099
00100
00101
00102 ret ::grammar::fa::op::complete (type fa , optional sink ={)} {
00103 if {[$fa is complete]} return
00104
00105
00106
00107 if {$sink eq ""} {
00108 sink = [FindNewState $fa sink]
00109 } elseif {[$fa state exists $sink]} {
00110 return -code error "The chosen sink state exists already"
00111 }
00112 $fa state add $sink
00113
00114
00115
00116
00117
00118 symbols = [$fa symbols]
00119 foreach sym $symbols {
00120 $fa next $sink $sym --> $sink
00121 }
00122
00123 if {[$fa is epsilon-free]} {
00124 foreach s [$fa states] {
00125 foreach missing [struct:: difference = \
00126 $symbols \
00127 [$fa symbols@ $s]] {
00128 $fa next $s $missing --> $sink
00129 }
00130 }
00131 } else {
00132
00133
00134
00135
00136
00137
00138
00139
00140 new = {}
00141 foreach s [$fa states] {
00142 foreach missing [struct:: difference = \
00143 $symbols \
00144 [$fa symbols@ [$fa = epsilon_closure $s]]] {
00145 lappend new $s $missing
00146 }
00147 }
00148
00149 foreach {s missing} $new {
00150 $fa next $s $missing --> $sink
00151 }
00152 }
00153 return
00154 }
00155
00156
00157
00158 ret ::grammar::fa::op::remove_eps (type fa) {
00159 # We eliminate all epsilon transitions by duplicating a number
00160 # of regular transitions, which we get through the epsilon
00161 # closure of the states having epsilon transitions. We do
00162 # nothing if the FA is epsilon free to begin with.
00163
00164 if {[$fa is epsilon-free]} return
00165
00166 # Note: Epsilon transitions touching start and final states
00167 # propagate the start markers forward and final markers
00168 # backward. We do this first by propagating start markers twice,
00169 # once with a reversed FA. This also gives us some
00170 # epsilon-closures as well.
00171
00172 foreach n {1 2} {
00173 foreach s [$fa startstates] {
00174 foreach e [$fa epsilon_closure $s] {
00175 $fa start add $e
00176 }
00177 }
00178 reverse $fa
00179 }
00180
00181 # Now duplicate all transitions which are followed or preceeded by
00182 # epsilon transitions of any number greater than zero.
00183
00184 # Note: The closure computations done by the FA are cached in the
00185 # FA, so doing it multiple times is no big penalty.
00186
00187 # FUTURE : Retrieve all transitions on one command.
00188
00189 # FUTURE : Different algorithm ...
00190 # Retrieve non-eps transitions for all states ...
00191 # Iterate this list. Compute e-closures for endpoints, cache
00192 # them. Duplicate the transition if needed, in that case add it to
00193 # the end of the list, for possible more duplication (may touch
00194 # different e-closures). Stop when the list is empty again.
00195
00196 set changed 1
00197 while {$changed} {
00198 set changed 0
00199 foreach s [$fa states] {
00200 foreach sym [$fa symbols@ $s] {
00201 set dest [$fa next $s $sym]
00202 if {$sym eq ""} {
00203 # Epsilon transitions.
00204
00205 # Get the closure, and duplicate all transitions for all
00206 # non-empty symbols as transitions of the original state.
00207 # This may lead to parallel transitions between states, hence
00208 # the catch. It prevents the generated error from stopping the
00209 # action, and no actual parallel transitions are created.
00210
00211 set clos [$fa epsilon_closure $s]
00212 foreach csym [$fa symbols@set $clos] {
00213 if {$csym eq ""} continue
00214 foreach d [$fa nextset $clos $csym] {
00215 if {![catch {$fa next $s $csym --> $d} msg]} {
00216 set changed 1
00217 }
00218 }
00219 }
00220 } else {
00221 # Regular transition. Go through all destination
00222 # states, compute their closures and replicate the
00223 # transition if the closure contains more than the
00224 # destination itself, to all states in the closure.
00225
00226 foreach d $dest {
00227 set clos [$fa epsilon_closure $d]
00228 if {[llength $clos] > 1} {
00229 foreach e $clos {
00230 if {![catch {$fa next $s $sym --> $e}]} {
00231 set changed 1
00232 }
00233 }
00234 }
00235 }
00236 }
00237 }
00238 }
00239 }
00240
00241 # At last, drop the epsilons for all states. Only now is this
00242 # possible because otherwise we might compute bad epsilon
00243 # closures in the previous loop.
00244
00245 foreach s [$fa states] {
00246 $fa !next $s ""
00247 }
00248 return
00249 }
00250
00251
00252
00253 ret ::grammar::fa::op::trim (type fa , optional what =!reachable|!useful) {
00254 # Remove various unwanted pices from the FA.
00255
00256 switch -exact -- $what {
00257 !reachable {
00258 set remove [$fa unreachable_states]
00259 }
00260 !useful {
00261 set remove [$fa unuseful_states]
00262 }
00263 !reachable&!useful -
00264 !(reachable|useful) {
00265 set remove [struct::set intersect [$fa unreachable_states] [$fa unuseful_states]]
00266 }
00267 !reachable|!useful -
00268 !(reachable&useful) {
00269 set remove [struct::set union [$fa unreachable_states] [$fa unuseful_states]]
00270 }
00271 default {
00272 return -code error "Expected !reachable, !useful, !reachable&!useful, !(reachable|useful), !reachable|!useful, or !(reachable&useful), got \"$what\""
00273 }
00274 }
00275
00276 foreach s $remove {
00277 $fa state delete $s
00278 }
00279 return
00280 }
00281
00282
00283
00284 ret ::grammar::fa::op::determinize (type fa , optional mapvar ={) {idstart 0}} {
00285
00286
00287
00288
00289
00290
00291 hasmap = 0
00292 if {$mapvar ne ""} {
00293 upvar 1 $mapvar map ; hasmap = 1
00294 }
00295
00296
00297
00298
00299 if {[$fa is deterministic]} {
00300 if {$hasmap} { map = {}}
00301 return
00302 }
00303
00304
00305
00306
00307
00308 if {[llength [$fa startstates]]} {trim $fa !reachable}
00309 if {[llength [$fa finalstates]]} {trim $fa !useful}
00310 if {[$fa is deterministic]} {
00311 if {$hasmap} { map = {}}
00312 return
00313 }
00314
00315
00316
00317
00318
00319
00320
00321 remove_eps $fa
00322 if {[llength [$fa startstates]]} {trim $fa !reachable}
00323 if {[llength [$fa finalstates]]} {trim $fa !useful}
00324 if {[$fa is deterministic]} {
00325 if {$hasmap} { map = {}}
00326 return
00327 }
00328
00329
00330
00331
00332
00333
00334 array subsets = {}
00335 id = $idstart
00336 pending = {}
00337 dfa = [[cons] %AUTO%]
00338
00339 foreach sym [$fa symbols] {$dfa symbol add $sym}
00340
00341
00342
00343
00344
00345 starts = [$fa startstates]
00346 if {[llength $starts] > 0} {
00347
00348
00349 starts = [lsort $starts] ;
00350 $dfa state add $id
00351 $dfa start add $id
00352
00353
00354 if {[$fa final? $starts = ]} {
00355 $dfa final add $id
00356 }
00357
00358 subsets = (dfa,$starts) $id
00359 subsets = (nfa,$id) $starts
00360
00361 lappend pending $id
00362 incr id
00363 } else {
00364
00365
00366
00367
00368 foreach s [$fa states] {
00369 nfaset = [list $s]
00370
00371 $dfa state add $id
00372 if {[$fa final? $s]} {
00373 $dfa final add $id
00374 }
00375
00376 subsets = (dfa,$nfa) $id =
00377 subsets = (nfa,$id) $nfa
00378 lappend = pending $id
00379 incr id
00380 }
00381 }
00382
00383 while {[llength $pending]} {
00384 dfastate = [struct::list shift pending]
00385
00386
00387
00388 nfaset = $subs = (nfa,$dfastate)
00389
00390 foreach sym [$fa symbols@ $nfaset = ] {
00391 nfanext = [lsort [$fa next $nfaset = $sym]]
00392
00393 if {![info exists subs = (dfa,$nfanext)]} {
00394
00395
00396 $dfa state add $id
00397 if {[$fa final? $nfanext = ]} {
00398 $dfa final add $id
00399 }
00400
00401 subsets = (dfa,$nfanext) $id
00402 subsets = (nfa,$id) $nfanext
00403
00404
00405
00406
00407 lappend pending $id
00408 incr id
00409 }
00410
00411
00412 $dfa next $dfastate $sym --> $subs = (dfa,$nfanext)
00413 }
00414 }
00415
00416 if {[llength [$fa startstates]]} {trim $fa !reachable}
00417 if {[llength [$fa finalstates]]} {trim $fa !useful}
00418
00419 if {$hasmap} {
00420
00421
00422 map = {}
00423 foreach s [$dfa states] {
00424 lappend map $s $subs = (nfa,$s)
00425 }
00426 }
00427
00428 $fa = $dfa
00429 $dfa destroy
00430
00431
00432 return
00433 }
00434
00435
00436
00437 ret ::grammar::fa::op::minimize (type fa , optional mapvar ={)} {
00438 # Brzozowski's method:
00439 # Reverse, determinize, reverse again, determinize again.
00440
00441 reverse $fa
00442 determinize $fa mapa
00443 reverse $fa
00444 determinize $fa mapb
00445
00446 if {$mapvar ne ""} {
00447 upvar 1 $mapvar map
00448
00449 if {![llength $mapa] && ![llength $mapb]} {
00450
00451 map = {}
00452 } elseif {[llength $mapa] && ![llength $mapb]} {
00453
00454 map = $mapa
00455 } elseif {![llength $mapa] && [llength $mapb]} {
00456
00457 map = $mapb
00458 } else {
00459
00460
00461
00462
00463
00464 map = {}
00465 array tmp = $mapa
00466 foreach {b a} $mapb = {
00467 compose = {}
00468 foreach a $a {foreach = o $tmp($a) {lappend compose $o}}
00469 lappend map $b [lsort -uniq $compose]
00470 }
00471 }
00472 }
00473
00474
00475 return
00476 }
00477
00478
00479
00480
00481 ret ::grammar::fa::op::complement (type fa) {
00482 # Complementing is possible if and only if the FA is complete,
00483 # and accomplished by swapping the final and non-final states.
00484
00485 if {![$fa is complete]} {
00486 return -code error "Unable to complement incomplete FA"
00487 }
00488 if {![$fa is deterministic]} {
00489 return -code error "Unable to complement non-deterministic FA"
00490 }
00491
00492 set newfinal [struct::set difference [$fa states] [$fa finalstates]]
00493 $fa final set $newfinal
00494 return
00495 }
00496
00497
00498
00499 ret ::grammar::fa::op::kleene (type fa) {
00500 # The Kleene Closure of the FA makes no sense if we don't have
00501 # start and final states we can work from.
00502
00503 set start [$fa startstates]
00504 set final [$fa finalstates]
00505
00506 if {![llength $start] || ![llength $final]} {
00507 return -code error "Unable to add Kleene's closure to a FA without start/final states"
00508 }
00509
00510 # FUTURE :: If final states have no outgoing transitions, and start
00511 # FUTURE :: states have no input transitions, then place the new
00512 # FUTURE :: transitions directly between start and final
00513 # FUTURE :: states. In that case we don't need new states.
00514
00515 # We need new start/final states, like for optional (see below)
00516
00517 set ns [NewState $fa s]
00518 set nf [NewState $fa f]
00519
00520 foreach s $start {$fa next $ns "" --> $s}
00521 foreach f $final {$fa next $f "" --> $nf}
00522
00523 $fa start clear ; $fa start add $ns
00524 $fa final clear ; $fa final add $nf
00525
00526 $fa next $ns "" --> $nf ; # Optionality
00527 $fa next $nf "" --> $ns ; # Loop for closure
00528 return
00529 }
00530
00531
00532
00533 ret ::grammar::fa::op::optional (type fa) {
00534 # The Optionality of the FA makes no sense if we don't have
00535 # start and final states we can work from.
00536
00537 set start [$fa startstates]
00538 set final [$fa finalstates]
00539
00540 if {![llength $start] || ![llength $final]} {
00541 return -code error "Unable to make a FA without start/final states optional"
00542 }
00543
00544 # We have to introduce new start and final states to ensure
00545 # that we do not get additional recognized words from the FA
00546 # due to epsilon transitions. IOW just placing epsilons from
00547 # all start to all final states is wrong. Consider unreachable
00548 # final states, they become reachable. Or final states able to
00549 # reach final states from. Again the epsilons would extend the
00550 # language. We have to detach our optional epsilon from anything
00551 # in the existing start/final states. Hence the new start/final.
00552
00553 # FUTURE : Recognize if there are no problems with placing direct
00554 # FUTURE : epsilons from start to final.
00555
00556 set ns [NewState $fa s]
00557 set nf [NewState $fa f]
00558
00559 foreach s $start {$fa next $ns "" --> $s}
00560 foreach f $final {$fa next $f "" --> $nf}
00561
00562 $fa start clear ; $fa start add $ns
00563 $fa final clear ; $fa final add $nf
00564
00565 $fa next $ns "" --> $nf ; # This is the transition which creates the optionality.
00566 return
00567 }
00568
00569
00570
00571 ret ::grammar::fa::op::union (type fa , type fb , optional mapvar ={)} {
00572 # We union the input symbols, then add the states and
00573 # transitions of the second FA to the first, adding in
00574 # epsilons for the start and final states as well. When
00575 # adding states we make sure that the new states do not
00576 # intersect with the existing states.
00577
00578 struct::list assign \
00579 [MergePrepare $fa $fb union smap] \
00580 astart afinal bstart bfinal
00581
00582 if {$mapvar ne ""} {
00583 upvar 1 $mapvar map
00584 map = $smap
00585 }
00586
00587
00588
00589 ns = [NewState $fa s]
00590 nf = [NewState $fa f]
00591
00592 eLink1N $fa $ns $astart
00593 eLink1N $fa $ns $bstart
00594
00595 eLinkN1 $fa $afinal $nf
00596 eLinkN1 $fa $bfinal $nf
00597
00598 $fa start clear ; $fa start add $ns
00599 $fa final clear ; $fa final add $nf
00600 return
00601 }
00602
00603
00604
00605 ret ::grammar::fa::op::intersect (type fa , type fb , optional mapvar ={) {idstart 0}} {
00606
00607
00608
00609
00610
00611
00612 struct::list assign [CrossPrepare $fa $fb intersection] tmp res
00613
00614
00615
00616
00617
00618 id = $idstart
00619 smap = {}
00620 bstart = [$tmp startstates]
00621 foreach a [$fa startstates] {
00622 foreach b $bstart {
00623 pair = [list $a $b]
00624 lappend smap $id $pair
00625 lappend pending $pair $id
00626 $res state add $id
00627 $res start add $id
00628 incr id
00629 }
00630 }
00631
00632 cp = [DoCross $fa $tmp $res $id $pending smap]
00633
00634 foreach {id pair} $smap {
00635 struct::list assign $pair a b
00636 if {[$fa final? $a] && [$tmp final? $b]} {
00637 $res final add $id
00638 }
00639 }
00640
00641
00642 trim $res
00643 if {$mapvar ne ""} {
00644 upvar 1 $mapvar map
00645
00646
00647 map = {}
00648 foreach {id pair} $smap {
00649 if {![$res state exists $id]} continue
00650 lappend map $id $pair
00651 }
00652 }
00653
00654
00655 $fa = $res
00656 $res destroy
00657 if {$tmp ne $fb} {$tmp destroy}
00658 return
00659 }
00660
00661
00662
00663 ret ::grammar::fa::op::difference (type fa , type fb , optional mapvar ={)} {
00664 # Difference has to run the two automata in parallel, using
00665 # paired states. Only the final states are defined differently
00666 # than for intersection. It has to be final in fa and _not_ final
00667 # in fb to be a final state of the result. <=> Accepted by A, but
00668 # not B, to be in the difference.
00669
00670 struct::list assign [CrossPrepare $fa $fb difference] tmp res
00671
00672 # The start states of the new FA consist of the cross-product of
00673 # the start states of fa with fb. These are also the states used
00674 # to seed DoCross.
00675
00676 set id 0
00677 set smap {}
00678 bstart = [$tmp startstates]
00679 foreach a [$fa startstates] {
00680 foreach b $bstart {
00681 pair = [list $a $b]
00682 lappend smap $id $pair
00683 lappend pending $pair $id
00684 $res state add $id
00685 $res start add $id
00686 incr id
00687 }
00688 }
00689
00690 cp = [DoCross $fa $tmp $res $id $pending smap]
00691
00692 foreach {id pair} $smap {
00693 struct::list assign $pair a b
00694 if {[$fa final? $a] && ![$tmp final? $b]} {
00695 $res final add $id
00696 }
00697 }
00698
00699
00700 trim $res
00701 if {$mapvar ne ""} {
00702 upvar 1 $mapvar map
00703
00704
00705 map = {}
00706 foreach {id pair} $smap {
00707 if {![$res state exists $id]} continue
00708 lappend map $id $pair
00709 }
00710 }
00711
00712
00713 $fa = $res
00714 $res destroy
00715 if {$tmp ne $fb} {$tmp destroy}
00716 return
00717 }
00718
00719
00720
00721 ret ::grammar::fa::op::concatenate (type fa , type fb , optional mapvar ={)} {
00722 # Like union, only the interconnect between existing and new FA is different.
00723
00724 struct::list assign \
00725 [MergePrepare $fa $fb concatenate smap] \
00726 astart afinal bstart bfinal
00727
00728 if {$mapvar ne ""} {
00729 upvar 1 $mapvar map
00730 map = $smap
00731 }
00732
00733 ns = [NewState $fa s]
00734 nm = [NewState $fa m] ;
00735 nf = [NewState $fa f]
00736
00737 eLink1N $fa $ns $astart
00738 eLinkN1 $fa $afinal $nm
00739
00740 eLink1N $fa $nm $bstart
00741 eLinkN1 $fa $bfinal $nf
00742
00743 $fa start clear ; $fa start add $ns
00744 $fa final clear ; $fa final add $nf
00745 return
00746 }
00747
00748
00749
00750
00751 ret ::grammar::fa::op::fromRegex (type fa , type regex , optional over ={)} {
00752 # Convert a regular expression into a FA. The regex is given as
00753 # parse tree in the form of a nested list.
00754
00755 # {. A B ...} ... Concatenation (accepts zero|one arguments).
00756
00757
00758
00759
00760
00761
00762
00763
00764
00765 if {![llength $regex]} {
00766 $fa clear
00767 return
00768 }
00769
00770 tmp = [[cons] %AUTO%]
00771
00772 if {![llength $over]} {
00773 over = [lsort -uniq [RESymbols $regex]]
00774 }
00775 foreach sym $over {
00776 $tmp symbol add $sym
00777 }
00778
00779 id = 0
00780 struct::list assign [Regex $tmp $regex id] s f
00781 $tmp start [list = $s]
00782 $tmp final [list = $f]
00783
00784 $fa = $tmp
00785 $tmp destroy
00786 return
00787 }
00788
00789
00790
00791
00792 ret ::grammar::fa::op::RESymbols (type regex) {
00793 set cmd [lindex $regex 0]
00794 switch -exact -- $cmd {
00795 ? - * - ! - + {
00796 return [RESymbols [lindex $regex 1]]
00797 }
00798 . - | - & {
00799 set res {}
00800 foreach sub [lrange $regex 1 end] {
00801 foreach sym [RESymbols $sub] {lappend res $sym}
00802 }
00803 return $res
00804 }
00805 S {
00806 return [list [lindex $regex 1]]
00807 }
00808 default {
00809 return -code error "Expected . ! ? * | &, or S, got \"$cmd\""
00810 }
00811 }
00812 }
00813
00814 ret ::grammar::fa::op::Regex (type fa , type regex , type idvar) {
00815 upvar 1 $idvar id
00816 set cmd [lindex $regex 0]
00817 switch -exact -- $cmd {
00818 ? {
00819 # Optional
00820 set a $id ; incr id ; $fa state add $a
00821 set b $id ; incr id ; $fa state add $b
00822
00823 struct::list assign [Regex $fa [lindex $regex 1] id] s f
00824 $fa next $a "" --> $s
00825 $fa next $f "" --> $b
00826 $fa next $a "" --> $b
00827 }
00828 * {
00829 # Kleene
00830 set a $id ; incr id ; $fa state add $a
00831 set b $a
00832
00833 struct::list assign [Regex $fa [lindex $regex 1] id] s f
00834 $fa next $a "" --> $s
00835 $fa next $f "" --> $a ;# == b
00836 }
00837 + {
00838 # Pos. Kleene
00839 set a $id ; incr id ; $fa state add $a
00840 set b $id ; incr id ; $fa state add $b
00841
00842 struct::list assign [Regex $fa [lindex $regex 1] id] s f
00843 $fa next $a "" --> $s
00844 $fa next $f "" --> $b
00845 $fa next $b "" --> $a
00846 }
00847 ! {
00848 # Complement.
00849 # Build up in a temp FA, complement, and
00850 # merge nack into the current
00851
00852 set a $id ; incr id ; $fa state add $a
00853 set b $id ; incr id ; $fa state add $b
00854
00855 set tmp [[cons] %AUTO%]
00856 foreach sym [$fa symbols] {$tmp symbol add $sym}
00857 struct::list assign [Regex $tmp [lindex $regex 1] id] s f
00858 $tmp start add $s
00859 $tmp final add $f
00860
00861 determinize $tmp {} $id
00862 incr id [llength [$tmp states]]
00863 if {![$tmp is complete]} {
00864 complete $tmp $id
00865 incr id
00866 }
00867 complement $tmp
00868
00869 # Merge and link.
00870 $fa deserialize_merge [$tmp serialize]
00871
00872 eLink1N $fa $a [$tmp startstates]
00873 eLinkN1 $fa [$tmp finalstates] $b
00874 $tmp destroy
00875 }
00876 & {
00877 # Intersection ... /And
00878
00879 if {[llength $regex] < 3} {
00880 # Optimized path. Intersection of one sub-expression
00881 # is the sub-expression itself.
00882
00883 struct::list assign [Regex $fa [lindex $regex 1] id] a b
00884 } else {
00885 set a $id ; incr id ; $fa state add $a
00886 set b $id ; incr id ; $fa state add $b
00887
00888 set tmp [[cons] %AUTO%]
00889 foreach sym [$fa symbols] {$tmp symbol add $sym}
00890 set idsub 0
00891 struct::list assign [Regex $tmp [lindex $regex 1] idsub] s f
00892 $tmp start add $s
00893 $tmp final add $f
00894
00895 set beta [[cons] %AUTO%]
00896 foreach sub [lrange $regex 2 end] {
00897 foreach sym [$fa symbols] {$beta symbol add $sym}
00898 struct::list assign [Regex $beta $sub idsub] s f
00899 $beta start add $s
00900 $beta final add $f
00901 intersect $tmp $beta {} $id
00902 }
00903 $beta destroy
00904 determinize $tmp {} $id
00905 incr id [llength [$tmp states]]
00906
00907 # Merge and link.
00908 $fa deserialize_merge [$tmp serialize]
00909
00910 eLink1N $fa $a [$tmp startstates]
00911 eLinkN1 $fa [$tmp finalstates] $b
00912 $tmp destroy
00913 }
00914 }
00915 . {
00916 # Concatenation ...
00917
00918 if {[llength $regex] == 1} {
00919 # Optimized path. No sub-expressions. This represents
00920 # language containing only the empty string, aka
00921 # epsilon.
00922
00923 set a $id ; incr id ; $fa state add $a
00924 set b $id ; incr id ; $fa state add $b
00925 $fa next $a "" --> $b
00926
00927 } elseif {[llength $regex] == 2} {
00928 # Optimized path. Concatenation of one sub-expression
00929 # is the sub-expression itself.
00930
00931 struct::list assign [Regex $fa [lindex $regex 1] id] a b
00932 } else {
00933 set first 1
00934 set last {}
00935 foreach sub [lrange $regex 1 end] {
00936 struct::list assign [Regex $fa $sub id] s f
00937 if {$first} {set first 0 ; set a $s}
00938 if {$last != {}} {
00939 $fa next $last "" --> $s
00940 }
00941 set last $f
00942 }
00943 set b $f
00944 }
00945 }
00946 | {
00947 # Alternatives ... (Union)
00948
00949 if {[llength $regex] == 1} {
00950 # Optimized path. No sub-expressions. This represents
00951 # the empty language, i.e. the language without words.
00952
00953 set a $id ; incr id ; $fa state add $a
00954 set b $id ; incr id ; $fa state add $b
00955
00956 } elseif {[llength $regex] == 2} {
00957 # Optimized path. Choice/Union of one sub-expression
00958 # is the sub-expression itself.
00959
00960 struct::list assign [Regex $fa [lindex $regex 1] id] a b
00961 } else {
00962 set a $id ; incr id ; $fa state add $a
00963 set b $id ; incr id ; $fa state add $b
00964 foreach sub [lrange $regex 1 end] {
00965 struct::list assign [Regex $fa $sub id] s f
00966 $fa next $a "" --> $s
00967 $fa next $f "" --> $b
00968 }
00969 }
00970 }
00971 S {
00972 # Atom, base transition.
00973 set sym [lindex $regex 1]
00974 set a $id ; incr id ; $fa state add $a
00975 set b $id ; incr id ; $fa state add $b
00976 $fa next $a $sym --> $b
00977 }
00978 default {
00979 return -code error "Expected . ! ? * | &, or S, got \"$cmd\""
00980 }
00981 }
00982 return [list $a $b]
00983 }
00984
00985
00986
00987 ret ::grammar::fa::op::CrossPrepare (type fa , type fb , type label) {
00988 set starta [$fa startstates]
00989 set finala [$fa finalstates]
00990 set startb [$fb startstates]
00991 set finalb [$fb finalstates]
00992 if {
00993 ![llength $starta] || ![llength $finala] ||
00994 ![llength $startb] || ![llength $finalb]
00995 } {
00996 return -code error "Unable to perform the $label of two FAs without start/final states"
00997 }
00998
00999 # The inputs are made complete over the union of their symbol
01000 # sets. A temp. container is used for the second input if necessary.
01001
01002 set totals [struct::set union [$fa symbols] [$fb symbols]]
01003 foreach sym [struct::set difference $totals [$fa symbols]] {
01004 $fa symbol add $sym
01005 }
01006 if {![$fa is epsilon-free]} {
01007 remove_eps $fa
01008 trim $fa
01009 }
01010 if {![$fa is complete]} {
01011 complete $fa
01012 }
01013 set tmp $fb
01014 set bnew [struct::set difference $totals [$fb symbols]]
01015 if {[llength $bnew]} {
01016 set tmp [[cons] %AUTO% = $fb]
01017 foreach sym $bnew {
01018 $tmp symbol add $sym
01019 }
01020 }
01021 if {![$fb is epsilon-free]} {
01022 if {$tmp eq $fb} {set tmp [[cons] %AUTO% = $fb]}
01023 remove_eps $tmp
01024 trim $tmp
01025 }
01026 if {![$fb is complete]} {
01027 if {$tmp eq $fb} {set tmp [[cons] %AUTO% = $fb]}
01028 complete $tmp
01029 }
01030
01031 set res [[cons] %AUTO%]
01032 foreach sym $totals {
01033 $res symbol add $sym
01034 }
01035
01036 return [list $tmp $res]
01037 }
01038
01039
01040
01041 ret ::grammar::fa::op::DoCross (type fa , type fb , type res , type id , type seed , type smapvar) {
01042 upvar 1 $smapvar smap
01043
01044 set symbols [$fa symbols]
01045 array set tmp $seed
01046
01047 set pending $seed
01048 while {[llength $pending]} {
01049 set cpair [struct::list shift pending]
01050 set cid [struct::list shift pending]
01051
01052 struct::list assign $cpair a b
01053
01054 # ASSERT: /res state exists /cid
01055
01056 # Generate the transitions for the pair, add the resulting
01057 # destinations to the FA, and schedule them for a visit if
01058 # they are new.
01059
01060 foreach sym $symbols {
01061 set adestinations [$fa next $a $sym]
01062 set bdestinations [$fb next $b $sym]
01063
01064 foreach ad $adestinations {
01065 foreach bd $bdestinations {
01066 set dest [list $ad $bd]
01067
01068 if {![info exists tmp($dest)]} {
01069 $res state add $id
01070 lappend smap $id $dest
01071 lappend pending $dest $id
01072 set tmp($dest) $id
01073 incr id
01074 }
01075 $res next $cid $sym --> $tmp($dest)
01076 }
01077 }
01078 }
01079 }
01080 return
01081 }
01082
01083
01084
01085 ret ::grammar::fa::op::MergePrepare (type fa , type fb , type label , type mapvar) {
01086 upvar 1 $mapvar map
01087
01088 set starta [$fa startstates]
01089 set finala [$fa finalstates]
01090 set startb [$fb startstates]
01091 set finalb [$fb finalstates]
01092 if {
01093 ![llength $starta] || ![llength $finala] ||
01094 ![llength $startb] || ![llength $finalb]
01095 } {
01096 return -code error "Unable to $label FAs without start/final states"
01097 }
01098
01099 # FUTURE: add {*}[symbols], ignore dup's
01100 foreach sym [$fb symbols] {catch {$fa symbol add $sym}}
01101
01102 set dup [struct::set intersect [$fa states] [$fb states]]
01103 if {![llength $dup]} {
01104 # The states do not overlap. A plain merge of fb is enough to
01105 # copy the information.
01106
01107 $fa deserialize_merge [$fb serialize]
01108 set map {}
01109 } else {
01110 # We have duplicate states, therefore we have to remap fb to
01111 # prevent interference between the two.
01112
01113 set map {}
01114 set tmp [[cons] %AUTO% = $fb]
01115 set id 0
01116 foreach s $dup {
01117 # The renaming process has to ensure that the new name is
01118 # in neither fa, nor already in fb as well.
01119 while {
01120 [$fa state exists $id] ||
01121 [$tmp state exists $id]
01122 } {incr id}
01123 $tmp state rename $s $id
01124 lappend map $id $s
01125 incr id
01126 }
01127
01128 set startb [$tmp startstates]
01129 set finalb [$tmp finalstates]
01130
01131 $fa deserialize_merge [$tmp serialize]
01132 $tmp destroy
01133 }
01134
01135 return [list $starta $finala $startb $finalb]
01136 }
01137
01138
01139
01140 ret ::grammar::fa::op::eLink1N (type fa , type from , type states) {
01141 foreach s $states {
01142 $fa next $from "" --> $s
01143 }
01144 return
01145 }
01146
01147
01148
01149 ret ::grammar::fa::op::eLinkN1 (type fa , type states , type to) {
01150 foreach s $states {
01151 $fa next $s "" --> $to
01152 }
01153 return
01154 }
01155
01156
01157
01158 ret ::grammar::fa::op::NewState (type fa , type prefix) {
01159 set newstate [FindNewState $fa $prefix]
01160 $fa state add $newstate
01161 return $newstate
01162 }
01163
01164
01165
01166 ret ::grammar::fa::op::FindNewState (type fa , type prefix) {
01167 #if {![$fa state exists $prefix]} {return $prefix}
01168 set n 0
01169 while {[$fa state exists ${prefix}.$n]} {incr n}
01170 return ${prefix}.$n
01171 }
01172
01173
01174
01175
01176 ret ::grammar::fa::op::toRegexp (type fa) {
01177 # NOTE: FUTURE - Do not go through the serialization, nor through
01178 # a matrix. The algorithm can be expressed more directly as
01179 # operations on the automaton (states and transitions).
01180
01181 set ET [ser_to_ematrix [$fa serialize]]
01182 while {[llength $ET] > 2} {
01183 set ET [matrix_drop_state $ET]
01184 }
01185 return [lindex $ET 0 1]
01186 }
01187
01188 ret ::grammar::fa::op::toRegexp2 (type fa) {
01189 # NOTE: FUTURE - See above.
01190 set ET [ser_to_ematrix [$fa serialize]]
01191 while {[llength $ET] > 2} {
01192 set ET [matrix_drop_state $ET re2]
01193 }
01194 return [lindex $ET 0 1]
01195 }
01196
01197
01198
01199
01200 ret ::grammar::fa::op::ser_to_ematrix (type ser) {
01201 if {[lindex $ser 0] ne "grammar::fa"} then {
01202 error "Expected grammar::fa automaton serialisation"
01203 }
01204 set stateL {}
01205 set n 2; foreach {state des} [lindex $ser 2] {
01206 lappend stateL $state
01207 set N($state) $n
01208 incr n
01209 }
01210 set row0 {}
01211 for {set k 0} {$k<$n} {incr k} {lappend row0 [list |]}
01212 set res [list $row0 $row0]
01213 foreach {from des} [lindex $ser 2] {
01214 set row [lrange $row0 0 1]
01215 if {[lindex $des 0]} then {lset res 0 $N($from) [list .]}
01216 if {[lindex $des 1]} then {lset row 1 [list .]}
01217 foreach to $stateL {set S($to) [list |]}
01218 foreach {symbol targetL} [lindex $des 2] {
01219 if {$symbol eq ""} then {
01220 set atom [list .]
01221 } else {
01222 set atom [list S $symbol]
01223 }
01224 foreach to $targetL {lappend S($to) $atom}
01225 }
01226 foreach to $stateL {
01227 if {[llength $S($to)] == 2} then {
01228 lappend row [lindex $S($to) 1]
01229 } else {
01230 lappend row $S($to)
01231 }
01232 }
01233 lappend res $row
01234 }
01235 return $res
01236 }
01237
01238 ret ::grammar::fa::op::matrix_drop_state (type T_, type in , optional ns =re1) {
01239 set sumcmd ${ns}::|
01240 set prodcmd ${ns}::.
01241 set T1 {}
01242 set lastcol {}
01243 foreach row $T_in {
01244 lappend T1 [lreplace $row end end]
01245 lappend lastcol [lindex $row end]
01246 }
01247 set lastrow [lindex $T1 end]
01248 set T1 [lreplace $T1 end end]
01249 set b [${ns}::* [lindex $lastcol end]]
01250 set lastcol [lreplace $lastcol end end]
01251 set res {}
01252 foreach row $T1 a $lastcol {
01253 set newrow {}
01254 foreach pos $row c $lastrow {
01255 lappend newrow [$sumcmd $pos [$prodcmd $a $b $c]]
01256 }
01257 lappend res $newrow
01258 }
01259 return $res
01260 }
01261
01262
01263
01264
01265 namespace ::grammar::fa::op::re1 {
01266 namespace export | . {\*}
01267 }
01268
01269 ret ::grammar::fa::op::re1::| (type args) {
01270 set L {}
01271
01272 # | = Choices.
01273 # Sub-choices are lifted into the top expression (foreach).
01274 # Identical choices are reduced to a single term (lsort -uniq).
01275
01276 foreach re $args {
01277 switch -- [lindex $re 0] "|" {
01278 foreach term [lrange $re 1 end] {lappend L $term}
01279 } default {
01280 lappend L $re
01281 }
01282 }
01283 set L [lsort -unique $L]
01284 if {[llength $L] == 1} then {
01285 return [lindex $L 0]
01286 } else {
01287 return [linsert $L 0 |]
01288 }
01289 }
01290
01291 ret ::grammar::fa::op::re1::. (type args) {
01292 set L {}
01293
01294 # . = Sequence.
01295 # One element sub-choices are lifted into the top expression.
01296 # Sub-sequences are lifted into the top expression.
01297
01298 foreach re $args {
01299 switch -- [lindex $re 0] "." {
01300 foreach term [lrange $re 1 end] {lappend L $term}
01301 } "|" {
01302 if {[llength $re] == 1} then {return $re}
01303 lappend L $re
01304 } default {
01305 lappend L $re
01306 }
01307 }
01308 if {[llength $L] == 1} then {
01309 return [lindex $L 0]
01310 } else {
01311 return [linsert $L 0 .]
01312 }
01313 }
01314
01315 ret ::grammar::fa::op::re1::* (type re) {
01316 # * = Kleene closure.
01317 # Sub-closures are lifted into the top expression.
01318 # One-element sub-(choices,sequences) are lifted into the top expression.
01319
01320 switch -- [lindex $re 0] "|" - "." {
01321 if {[llength $re] == 1} then {
01322 return [list .]
01323 } else {
01324 return [list * $re]
01325 }
01326 } "*" {
01327 return $re
01328 } default {
01329 return [list * $re]
01330 }
01331 }
01332
01333
01334
01335
01336 namespace ::grammar::fa::op::re2 {
01337
01338
01339 namespace import [namespace parent]::re1::|
01340 namespace import [namespace parent]::re1::\\*
01341 }
01342
01343 ret ::grammar::fa::op::re2::. (type args) {
01344
01345 # . = Sequences
01346 # Sub-sequences are lifted into the top expression.
01347 # Sub-choices are multiplied out.
01348 # <Example a(b|c) => ab|ac >
01349
01350 set L {}
01351 set n -1
01352 foreach re $args {
01353 incr n
01354 switch -- [lindex $re 0] "." {
01355 foreach term [lrange $re 1 end] {lappend L $term}
01356 } "|" {
01357 set res [list |]
01358 set L2 [lreplace $args 0 $n]
01359 foreach term [lrange $re 1 end] {
01360 lappend res [eval [list .] $L [list $term] $L2]
01361 }
01362 return [eval $res]
01363 } default {
01364 lappend L $re
01365 }
01366 }
01367 if {[llength $L] == 1} then {
01368 return [lindex $L 0]
01369 } else {
01370 return [linsert $L 0 .]
01371 }
01372 }
01373
01374
01375
01376
01377 ret ::grammar::fa::op::simplifyRegexp (type RE0) {
01378 set RE1 [namespace inscope nonnull $RE0]
01379 if {[lindex $RE1 0] eq "S" || $RE1 eq "." || $RE1 eq "|"} then {
01380 return $RE1
01381 }
01382 set tmp [grammar::fa %AUTO% fromRegex $RE1]
01383 $tmp minimize
01384 set RE1 [toRegexp $tmp]
01385 $tmp destroy
01386 if {[string length $RE1] < [string length $RE0]} then {
01387 set RE0 $RE1
01388 }
01389 if {[lindex $RE0 0] eq "S"} then {return $RE0}
01390 set res [lrange $RE0 0 0]
01391 foreach branch [lrange $RE0 1 end] {
01392 lappend res [simplifyRegexp $branch]
01393 }
01394 return $res
01395 }
01396
01397
01398
01399
01400 namespace ::grammar::fa::op::nonnull {}
01401
01402 ret ::grammar::fa::op::nonnull::| (type args) {
01403 set also_empty false
01404 set res [list |]
01405 foreach branch $args {
01406 set RE [eval $branch]
01407 if {[lindex $RE 0] eq "?"} then {
01408 set also_empty true
01409 set RE [lindex $RE 1]
01410 }
01411 switch -- [lindex $RE 0] "|" {
01412 eval [lreplace $RE 0 0 lappend res]
01413 } "." {
01414 if {[llength $RE] == 1} then {
01415 set also_empty true
01416 } else {
01417 lappend res $RE
01418 }
01419 } default {
01420 lappend res $RE
01421 }
01422 }
01423 if {!$also_empty} then {return $res}
01424 foreach branch [lrange $res 1 end] {
01425 if {[lindex $branch 0] eq "*"} then {return $res}
01426 }
01427 if {[llength $res] == 1} then {
01428 return [list .]
01429 } elseif {[llength $res] == 2} then {
01430 return [lreplace $res 0 0 ?]
01431 } else {
01432 return [list ? $res]
01433 }
01434 }
01435
01436 ret ::grammar::fa::op::nonnull::. (type args) {
01437 set res [list .]
01438 foreach branch $args {
01439 set RE [eval $branch]
01440 switch -- [lindex $RE 0] "|" {
01441 if {[llength $RE] == 1} then {return $RE}
01442 lappend res $RE
01443 } "." {
01444 eval [lreplace $RE 0 0 lappend res]
01445 } default {
01446 lappend res $RE
01447 }
01448 }
01449 return $res
01450 }
01451
01452 ret ::grammar::fa::op::nonnull::* (type sub) {
01453 set RE [eval $sub]
01454 switch -- [lindex $RE 0] "*" - "?" - "+" {
01455 return [lreplace $RE 0 0 *]
01456 } default {
01457 return [list * $RE]
01458 }
01459 }
01460
01461 ret ::grammar::fa::op::nonnull::+ (type sub) {
01462 set RE [eval $sub]
01463 switch -- [lindex $RE 0] "+" {
01464 return $RE
01465 } "*" - "?" {
01466 return [lreplace $RE 0 0 *]
01467 } default {
01468 return [list * $RE]
01469 }
01470 }
01471
01472 ret ::grammar::fa::op::nonnull::? (type sub) {
01473 set RE [eval $sub]
01474 switch -- [lindex $RE 0] "?" - "*" {
01475 return $RE
01476 } "+" {
01477 return [lreplace $RE 0 0 *]
01478 } default {
01479 return [list ? $RE]
01480 }
01481 }
01482
01483 ret ::grammar::fa::op::nonnull::S (type name) {
01484 return [list S $name]
01485 }
01486
01487
01488
01489
01490 ret ::grammar::fa::op::toTclRegexp (type re , type symdict) {
01491 return [lindex [namespace inscope tclre $re $symdict] 1]
01492 }
01493
01494
01495
01496
01497 namespace ::grammar::fa::op::tclre {}
01498
01499 ret ::grammar::fa::op::tclre::S (type name , type dict) {
01500 array set A $dict
01501 if {[info exists A($name)]} then {
01502 return $A($name)
01503 } elseif {[string length $name] == 1} then {
01504 if {[regexp {[\\\[\]{}.()*+?^$]} $name]} then {
01505 return [list char \\$name]
01506 } else {
01507 return [list char $name]
01508 }
01509 } else {
01510 return [list class "\[\[:${name}:\]\]"]
01511 }
01512 }
01513
01514 ret ::grammar::fa::op::tclre::. (type args) {
01515 set suffix [lrange $args end end]
01516 set L {}
01517 foreach factor [lrange $args 0 end-1] {
01518 set pair [eval $factor $suffix]
01519 switch -- [lindex $pair 0] "sum" {
01520 lappend L ([lindex $pair 1])
01521 } default {
01522 lappend L [lindex $pair 1]
01523 }
01524 }
01525 return [list prod [join $L ""]]
01526 }
01527
01528 ret ::grammar::fa::op::tclre::* (type re , type dict) {
01529 set pair [eval $re [list $dict]]
01530 switch -- [lindex $pair 0] "sum" - "prod" {
01531 return [list prod "([lindex $pair 1])*"]
01532 } default {
01533 return [list prod "[lindex $pair 1]*"]
01534 }
01535 }
01536
01537 ret ::grammar::fa::op::tclre::+ (type re , type dict) {
01538 set pair [eval $re [list $dict]]
01539 switch -- [lindex $pair 0] "sum" - "prod" {
01540 return [list prod "([lindex $pair 1])+"]
01541 } default {
01542 return [list prod "[lindex $pair 1]+"]
01543 }
01544 }
01545
01546 ret ::grammar::fa::op::tclre::? (type re , type dict) {
01547 set pair [eval $re [list $dict]]
01548 switch -- [lindex $pair 0] "sum" - "prod" {
01549 return [list prod "([lindex $pair 1])?"]
01550 } default {
01551 return [list prod "[lindex $pair 1]?"]
01552 }
01553 }
01554
01555 ret ::grammar::fa::op::tclre::| (type args) {
01556 set suffix [lrange $args end end]
01557 set charL {}
01558 set classL {}
01559 set prodL {}
01560 foreach factor [lrange $args 0 end-1] {
01561 set pair [eval $factor $suffix]
01562 switch -- [lindex $pair 0] "char" {
01563 lappend charL [lindex $pair 1]
01564 } "class" {
01565 lappend classL [string range [lindex $pair 1] 1 end-1]
01566 } default {
01567 lappend prodL [lindex $pair 1]
01568 }
01569 }
01570 if {[llength $charL]>1 || [llength $classL]>0} then {
01571 while {[set n [lsearch $charL -]] >= 0} {
01572 lset charL $n {\-}
01573 }
01574 set bracket "\[[join $charL ""][join $classL ""]\]"
01575 if {![llength $prodL]} then {
01576 return [list atom $bracket]
01577 }
01578 lappend prodL $bracket
01579 } else {
01580 eval [list lappend prodL] $charL
01581 }
01582 return [list sum [join $prodL |]]
01583 }
01584
01585 ret ::grammar::fa::op::tclre::& (type args) {
01586 error "Cannot express language intersection in Tcl-RE's"
01587
01588 # Note: This can be translated by constructing an automaton for
01589 # the intersection, and then translating its conversion to a
01590 # regular expression.
01591 }
01592
01593 ret ::grammar::fa::op::tclre::! (type args) {
01594 error "Cannot express language complementation in Tcl-RE's"
01595
01596 # Note: This can be translated by constructing an automaton for
01597 # the complement, and then translating its conversion to a regular
01598 # expression. This however requires knowledge regarding the set of
01599 # symbols. Large (utf-8) for Tcl regexes.
01600 }
01601
01602 /* ### ### ### ######### ######### #########*/
01603
01604 ret ::grammar::fa::op::constructor (type cmd) {
01605 variable cons $cmd
01606 return
01607 }
01608
01609 ret ::grammar::fa::op::cons () {
01610 variable cons
01611 if {$cons ne ""} {return $cons}
01612 return -code errror "No constructor for FA container was established."
01613 }
01614
01615 /* ### ### ### ######### ######### #########*/
01616 /* Package Management*/
01617
01618 package provide grammar::fa::op 0.4
01619