peg.tcl

Go to the documentation of this file.
00001 /*  -*- tcl -*-*/
00002 /* */
00003 /*  Copyright (c) 2005 by Andreas Kupries <andreas_kupries@users.sourceforge.net>*/
00004 /*  Grammars / Parsing Expression Grammars / Container*/
00005 
00006 /*  ### ### ### ######### ######### #########*/
00007 /*  Package description*/
00008 
00009 /*  A class whose instances hold all the information describing a single*/
00010 /*  parsing expression grammar (terminal symbols, nonterminal symbols,*/
00011 /*  nonterminal rules, start expression, hints), and operations to*/
00012 /*  define, manipulate, and query this information.*/
00013 /* */
00014 /*  The container has only one functionality beyond the simple storage*/
00015 /*  of the aforementioned information. It keeps track if the provided*/
00016 /*  grammar is valid (*). The container provides no higher-level*/
00017 /*  operations on the grammar, like removal of unreachable nonterminals,*/
00018 /*  rule rewriting, etc.*/
00019 /* */
00020 /*  The set of terminal symbols is the set of characters (i.e.*/
00021 /*  implicitly defined). For Tcl this means that all the unicode*/
00022 /*  characters are supported.*/
00023 /* */
00024 /*  (*) A grammar is valid if and only if all its rules are valid.  A*/
00025 /*  rule is valid if and only if all nonterminals referenced by the RHS*/
00026 /*  of the rule are in the set of nonterminals, and if only the allowed*/
00027 /*  operators are used in the expression.*/
00028 
00029 /*  ### ### ### ######### ######### #########*/
00030 /*  Requisites*/
00031 
00032 package require snit         ; /*  Tcllib | OO system used*/
00033 
00034 /*  ### ### ### ######### ######### #########*/
00035 /*  Implementation*/
00036 
00037 snit::type ::grammar::peg {
00038     /*  ### ### ### ######### ######### #########*/
00039     /*  Type API. Helpful methods for PEs.*/
00040 
00041     ret  ValidateSerial (type e , type prefix) {}
00042     ret  Validate   (type e) {}
00043     ret  References (type e) {}
00044     ret  Rename     (type e , type old , type new) {}
00045 
00046     /*  ### ### ### ######### ######### #########*/
00047     /*  Instance API*/
00048 
00049     constructor {args} {}
00050 
00051     ret  clear () {}
00052 
00053     ret  =   {src} ()
00054     method --> {dst} {}
00055     ret  serialize () {}
00056     ret  deserialize (type value) {}
00057 
00058     ret  {is valid} () {}
00059     ret  start (type args) {}
00060 
00061     ret  nonterminals () {}
00062     ret  {nonterminal add}    (type nts , type pae) {}
00063     ret  {nonterminal delete} (type nts , type pae) {}
00064     ret  {nonterminal exists} (type nts) {}
00065     ret  {nonterminal rename} (type ntsold , type ntsnew) {}
00066     ret  {nonterminal mode}   (type nts , type args) {}
00067 
00068     ret  {unknown nonterminals} () {}
00069 
00070     ret  {nonterminal rule}   (type nts) {}
00071 
00072     /*  ### ### ### ######### ######### #########*/
00073     /*  Internal data structures.*/
00074 
00075     /*  - Set of nonterminal symbols, and*/
00076     /*  - Mapping from nonterminals to their defining parsing*/
00077     /*    expressions, and*/
00078     /*  - Start parsing expression.*/
00079     /*  - And usage of nonterminals by others, required for tracking*/
00080     /*    of validity.*/
00081 
00082     /*  se: expression               | Start expression*/
00083     /*  nt: nonterm -> expression    | Known Nt's, their rules*/
00084     /*  re: nonterm -> list(nonterm) | Known Nt's, what others they use.*/
00085     /*  ir: nonterm -> list(nonterm) | Nt's, possibly unknown, their users.*/
00086     /*  uk: nonterm -> use counter   | Nt's which are unknown.*/
00087     /** 
00088      *# Both 're' and 'ir' can list a nonterminal A multiple times,
00089      *# if it uses or is used multiple times.
00090      *#
00091      *# Grammar is invalid <=> '[array size uk] > 0'
00092  */
00093 
00094     variable se        epsilon
00095     variable nt -array {}
00096     variable re -array {}
00097     variable ir -array {}
00098     variable uk -array {}
00099     variable mo -array {}
00100 
00101     /*  ### ### ### ######### ######### #########*/
00102     /*  Instance API Implementation.*/
00103 
00104     constructor {args} {
00105     if {
00106         (([llength $args] != 0) && ([llength $args] != 2)) ||
00107         (([llength $args] == 2) && ([lsearch {= := <-- as deserialize} [lindex $args 0]]) < 0)
00108     } {
00109         return -code error "wrong/* args: $self ?=|:=|<--|as|deserialize a'?"*/
00110     }
00111 
00112     /*  Serialization arguments.*/
00113     /*  [llength args] in {0 2}*/
00114     /* */
00115     /*  =           src-obj*/
00116     /*  :=          src-obj*/
00117     /*  <--         src-obj*/
00118     /*  as          src-obj*/
00119     /*  deserialize src-value*/
00120 
00121     if {[llength $args] == 2} {
00122         foreach {op val} $args break
00123         switch -exact -- $op {
00124         = - := - <-- - as {
00125             $self deserialize [$val serialize]
00126         }
00127         deserialize {
00128             $self deserialize $val
00129         }
00130         }
00131     }
00132     return
00133     }
00134 
00135     /* destructor {}*/
00136 
00137     ret  clear () {
00138     array unset nt *
00139     array unset re *
00140     array unset ir *
00141     array unset uk *
00142     array unset mo *
00143     set se epsilon
00144     return
00145     }
00146 
00147     ret  = {src} (
00148     $type self , type dserialize [$, type src , type serialize]
00149     )
00150 
00151     method --> {dst} {
00152     $dst deserialize [$self serialize]
00153     }
00154 
00155     ret  serialize () {
00156     return [::list \
00157         grammar::pegc \
00158         [array get nt] \
00159         [array get mo] \
00160         $se]
00161     }
00162 
00163     ret  deserialize (type value) {
00164     # Validate value, then clear and refill.
00165 
00166     $self CheckSerialization $value ntv mov sev
00167     $self clear
00168 
00169     foreach {s e} $ntv {
00170         $self NtAdd $s $e
00171     }
00172     array set mo $mov
00173     $self start $sev
00174     return
00175     }
00176 
00177     ret  {is valid} () {
00178     return [expr {[array size uk] == 0}]
00179     }
00180 
00181     ret  start (type args) {
00182     if {[llength $args] == 0} {
00183         return $se
00184     }
00185     if {[llength $args] > 1} {
00186         return -code error "wrong#args: $self start ?pe?"
00187     }
00188     set newse [lindex $args 0]
00189     Validate $newse
00190     set se   $newse
00191     return
00192     }
00193 
00194     ret  nonterminals () {
00195     return [array names nt]
00196     }
00197 
00198     ret  {nonterminal add} (type nts , type pae) {
00199     $self CheckNtKnown $nts
00200     Validate $pae
00201     $self NtAdd $nts $pae
00202     return
00203     }
00204 
00205     ret  {nonterminal mode} (type nts , type args) {
00206     $self CheckNt $nts
00207     if {![llength $args]} {
00208         return $mo($nts)
00209     } elseif {[llength $args] == 1} {
00210         set mo($nts) [lindex $args 0]
00211         return
00212     } else {
00213         return -code error "wrong#args"
00214     }
00215     return
00216     }
00217 
00218     ret  {nonterminal delete} (type nts , type args) {
00219     set args [linsert $args 0 $nts]
00220     foreach nts $args {
00221         $self CheckNt $nts
00222     }
00223 
00224     foreach nts $args {
00225         $self NtDelete $nts
00226     }
00227     return
00228     }
00229 
00230     ret  {nonterminal exists} (type nts) {
00231     return [info exists nt($nts)]
00232     }
00233 
00234     ret  {nonterminal rename} (type ntsold , type ntsnew) {
00235     $self CheckNt      $ntsold
00236     $self CheckNtKnown $ntsnew
00237 
00238     # Difficult. We have to go through all rules and rewrite their
00239     # RHS to use the new name of the nonterminal. We can however
00240     # restrict ourselves to the rules which actually use the
00241     # changed nonterminal.
00242 
00243     # We also have to update the used/user information. We know
00244     # that the validity of the grammar is unchanged by this
00245     # operation. The unknown information is unchanged as well, as
00246     # we cannot rename an unknown nonterminal. IOW we know that
00247     # 'ntsold' is not in 'uk', and so 'ntsnew' will not be in that
00248     # array either after the rename.
00249 
00250     set myusers $ir($ntsold)
00251     set myused  $re($ntsold)
00252 
00253     set nt($ntsnew) $nt($ntsold)
00254     unset            nt($ntsold)
00255 
00256     set mo($ntsnew) $mo($ntsold)
00257     unset            mo($ntsold)
00258 
00259     foreach x $myusers {
00260         set nt($x) [Rename $nt($x) $ntsold $ntsnew]
00261     }
00262 
00263     # It is possible to use myself, and be used by myself.
00264 
00265     while {[set pos [lsearch -exact $myusers $ntsold]] >= 0} {
00266         set myusers [lreplace $myusers $pos $pos $ntsnew]
00267     }
00268     while {[set pos [lsearch -exact $myused $ntsold]] >= 0} {
00269         set myused [lreplace $myused $pos $pos $ntsnew]
00270     }
00271 
00272     set re($ntsnew) $myusers
00273     set ir($ntsnew) $myused
00274 
00275     unset            re($ntsold)
00276     unset            ir($ntsold)
00277     return
00278     }
00279 
00280     ret  {unknown nonterminals} () {
00281     return [array names uk]
00282     }
00283 
00284     ret  {nonterminal rule} (type nts) {
00285     $self CheckNt $nts
00286     return $nt($nts)
00287     }
00288 
00289     /*  ### ### ### ######### ######### #########*/
00290     /*  Internal helper methods*/
00291 
00292     ret  NtAdd (type nts , type pae) {
00293     # None of the symbols is known. We can add them to the
00294     # grammar. If however any of their PEs is known to the PE
00295     # storage then we had expressions refering to unknown
00296     # symbols. The grammar is most certainly invalid and may have
00297     # become valid right now. We have to invalidate the validity
00298     # cache.
00299 
00300     set nt($nts) $pae
00301     set mo($nts) value
00302 
00303     # Track users, uses, and unknowns.
00304 
00305     set references [References $pae]
00306 
00307     # We use the refered symbols
00308     set re($nts) $references
00309 
00310     # We are a user for the refered symbols
00311     # Record unknown symbols immediately.
00312     foreach x $references {
00313         lappend ir($x) $nts
00314         if {[info exists nt($x)]} continue
00315         if {[catch {incr uk($x)}]} {set uk($x) 1}
00316     }
00317 
00318     # We are definitely not unknown.
00319     unset -nocomplain uk($nts)
00320     return
00321     }
00322 
00323     ret  NtDelete (type nts) {
00324     set references $re($nt)
00325 
00326     # We are gone. We are not using anything anymore.
00327     unset    nt($nts)
00328     unset    re($nts)
00329     unset    mo($nts)
00330 
00331     # Our references loose us as their user.
00332     foreach x $references {
00333         set pos [lsearch -exact $ir($x) $x]
00334         if {$pos < 0} {error PANIC}
00335         set ir($x) [lreplace $ir($x) $pos $pos]
00336         if {[llength $ir($x)] == 0} {
00337         unset ir($x)
00338         # x is not referenced anywhere, cannot be unknown.
00339         unset -nocomplain uk($x)
00340         }
00341         if {[info exists uk($x)]} {
00342         incr uk($x) -1
00343         }
00344     }
00345 
00346     # We might be used by others still, and therefore become
00347     # unknown.
00348 
00349     if {[info exists ir($nts]} {
00350         set uk($nts) [llength $ir($nts)]
00351     }
00352     return
00353     }
00354 
00355     ret  CheckNt (type nts) {
00356     if {![info exists nt($nts)]} {
00357         return -code error "Invalid nonterminal \"$nts\""
00358     }
00359     return
00360     }
00361 
00362     ret  CheckNtKnown (type nts) {
00363     if {[info exists nt($nts)]} {
00364         return -code error "Nonterminal \"$nts\" is already known"
00365     }
00366     return
00367     }
00368 
00369     ret  CheckSerialization (type value , type ntv , type mov , type sev) {
00370     # value is list/3 ('grammar::pegc' nonterminals start)
00371     # terminals is list of string.
00372     # nonterminals is doct (key is string, value is expr)
00373     # start is expr
00374     # terminals * nonterminals == empty
00375     # expr is parsing expression (Validate PE).
00376 
00377     upvar 1 \
00378         $ntv ntvs \
00379         $mov movs \
00380         $sev sevs
00381 
00382     set prefix "error in serialization:"
00383     if {[llength $value] != 4} {
00384         return -code error "$prefix list length not 4"
00385     }
00386 
00387     struct::list assign $value type nonterminals hints start
00388     if {$type ne "grammar::pegc"} {
00389         return -code error "$prefix unknown type \"$type\""
00390     }
00391 
00392     ValidateSerial $start "$prefix invalid start expression"
00393 
00394     if {[llength $nonterminals] % 2 == 1} {
00395         return -code error "$prefix nonterminal data is not a dictionary"
00396     }
00397     array set _nt $nonterminals
00398     if {[llength $nonterminals] != (2*[array size _nt])} {
00399         return -code error "$prefix nonterminal data contains duplicate names, or misses some"
00400     }
00401 
00402     foreach {s e} $nonterminals {
00403         ValidateSerial $start "$prefix nonterminal \"$s\", invalid parsing expression"
00404     }
00405 
00406 
00407     if {[llength $hints] % 2 == 1} {
00408         return -code error "$prefix nonterminal modes is not a dictionary"
00409     }
00410     array set _mo $hints
00411     if {[llength $hints] != (2*[array size _mo])} {
00412         return -code error "$prefix nonterminal modes contains duplicate names, or misses some"
00413     }
00414     foreach {s _} $hints {
00415         if {![info exists _nt($s)]} {
00416         return -code error "$prefix nonterminal mode for unknown nonterminal \"$s\""
00417         }
00418     }
00419 
00420     set ntvs $nonterminals
00421     set sevs $start
00422     set movs $hints
00423     return
00424     }
00425 
00426     /*  ### ### ### ######### ######### #########*/
00427 
00428     /*  ### ### ### ######### ######### #########*/
00429     /*  Type API implementation.*/
00430 
00431     ret  ValidateSerial (type e , type prefix) {
00432     if {![catch {Validate $e} msg]} return
00433     return -code error "$prefix, $msg"
00434     }
00435 
00436     ret  Validate (type e) {
00437     if {[llength $e] == 0} {
00438         return -code error "invalid empty expression list"
00439     }
00440 
00441     set op [lindex $e 0]
00442     set ar [lrange $e 1 end]
00443 
00444     switch -exact -- $op {
00445         epsilon - alpha - alnum - dot {
00446         if {[llength $ar] > 0} {
00447             return -code error "wrong#args for \"$op\""
00448         }
00449         }
00450         .. {
00451         if {[llength $ar] != 2} {
00452             return -code error "wrong#args for \"$op\""
00453         }
00454         # Leaf, arguments are not expressions to validate.
00455         }
00456         n - t {
00457         if {[llength $ar] != 1} {
00458             return -code error "wrong#args for \"$op\""
00459         }
00460         # Leaf, argument is not expression to validate.
00461         }
00462         & - ! - * - + - ? {
00463         if {[llength $ar] != 1} {
00464             return -code error "wrong#args for \"$op\""
00465         }
00466         Validate [lindex $ar 0]
00467         }
00468         x - / {
00469         if {![llength $ar]} {
00470             return -code error "wrong#args for \"$op\""
00471         }
00472         foreach e $ar {
00473             Validate $e
00474         }
00475         }
00476         default {
00477         return -code error "invalid operator \"$op\""
00478         }
00479     }
00480     }
00481 
00482     ret  References (type e) {
00483     set references {}
00484 
00485     set op [lindex $e 0]
00486     set ar [lrange $e 1 end]
00487 
00488     switch -exact -- $op {
00489         epsilon - t - alpha - alnum - dot - .. {}
00490         n {
00491         # Remember referenced nonterminal
00492         lappend references [lindex $ar 0]
00493         }
00494         & - ! - * - + - ? {
00495         foreach r [References [lindex $ar 0]] {
00496             lappend references $r
00497         }
00498         }
00499         x - / {
00500         foreach e $ar {
00501             foreach r [References $e] {
00502             lappend references $r
00503             }
00504         }
00505         }
00506     }
00507     return $references
00508     }
00509 
00510     ret  Rename (type e , type old , type new) {
00511     set op [lindex $e 0]
00512     set ar [lrange $e 1 end]
00513 
00514     switch -exact -- $op {
00515         epsilon - t - alpha - alnum - dot - .. {return $e}
00516         n {
00517         if {[lindex $ar 0] ne $old} {return $e}
00518         return [list n $new]
00519         }
00520         & - ! - * - + - ? {
00521         return [list $op [Rename [lindex $ar 0] $old $new]]
00522         }
00523         x - / {
00524         set res $op
00525         foreach e $ar {
00526             lappend res [Rename $e $old $new]
00527         }
00528         return $res
00529         }
00530     }
00531     }
00532 
00533     /*  ### ### ### ######### ######### #########*/
00534     /*  Type Internals.*/
00535 
00536     /*  ### ### ### ######### ######### #########*/
00537 }
00538 
00539 /*  ### ### ### ######### ######### #########*/
00540 /*  Package Management*/
00541 
00542 package provide grammar::peg 0.1
00543 

Generated on 21 Sep 2010 for Gui by  doxygen 1.6.1