cgen.tcl

Go to the documentation of this file.
00001 /*  cgen.tcl --*/
00002 /* */
00003 /*  Generator core for compiler of magic(5) files into recognizers*/
00004 /*  based on the 'rtcore'.*/
00005 /* */
00006 /*  Copyright (c) 2004-2005 Colin McCormack <coldstore@users.sourceforge.net>*/
00007 /*  Copyright (c) 2005      Andreas Kupries <andreas_kupries@users.sourceforge.net>*/
00008 /* */
00009 /*  See the file "license.terms" for information on usage and redistribution*/
00010 /*  of this file, and for a DISCLAIMER OF ALL WARRANTIES.*/
00011 /*  */
00012 /*  RCS: @(#) $Id: cgen.tcl,v 1.7 2007/06/23 03:39:34 andreas_kupries Exp $*/
00013 
00014 /* */
00015 /* */
00016 /*  "mime type recognition in pure tcl"*/
00017 /*  http://wiki.tcl.tk/12526*/
00018 /* */
00019 /*  Tcl code harvested on:  10 Feb 2005, 04:06 GMT*/
00020 /*  Wiki page last updated: ???*/
00021 /* */
00022 /* */
00023 
00024 /*  ### ### ### ######### ######### #########*/
00025 /*  Requirements*/
00026 
00027 package require Tcl 8.4
00028 package require fileutil::magic::rt ; /*  Runtime core, for Access to the typemap*/
00029 package require struct::list        ; /*  Our data structures.*/
00030 package require struct::tree        ; /* */
00031 
00032 package provide fileutil::magic::cgen 1.0
00033 
00034 /*  ### ### ### ######### ######### #########*/
00035 /*  Implementation*/
00036 
00037 namespace ::fileutil::magic::cgen {
00038     /*  Import the runtime typemap into our scope.*/
00039     variable ::fileutil::magic::rt::typemap
00040 
00041     /*  The tree most operations use for their work.*/
00042     variable tree {}
00043 
00044     /*  Generator data structure.*/
00045     variable regions
00046 
00047     /*  Type mapping for indirect offsets.*/
00048     /*  empty -> long/Q, because this uses native byteorder.*/
00049 
00050     array  otmap =  {
00051         .b c    .B c
00052         .s s    .S S
00053         .l i    .L I
00054     {} Q
00055     }
00056 
00057     /*  Export the API*/
00058     namespace export 2tree treedump treegen
00059 }
00060 
00061 
00062 /*  Optimisations:*/
00063 
00064 /*  reorder tests according to expected or observed frequency this*/
00065 /*  conflicts with reduction in strength optimisations.*/
00066 
00067 /*  Rewriting within a level will require pulling apart the list of*/
00068 /*  tests at that level and reordering them.  There is an inconsistency*/
00069 /*  between handling at 0-level and deeper level - this has to be*/
00070 /*  removed or justified.*/
00071 
00072 /*  Hypothetically, every test at the same level should be mutually*/
00073 /*  exclusive, but this is not given, and should be detected.  If true,*/
00074 /*  this allows reduction in strength to switch on Numeric tests*/
00075 
00076 /*  reduce Numeric tests at the same level to switches*/
00077 /* */
00078 /*  - first pass through clauses at same level to categorise as*/
00079 /*    variant values over same test (type and offset).*/
00080 
00081 /*  work out some way to cache String comparisons*/
00082 
00083 /*  Reduce seek/reads for String comparisons at same level:*/
00084 /* */
00085 /*  - first pass through clauses at same level to determine string ranges.*/
00086 /* */
00087 /*  - String tests at same level over overlapping ranges can be*/
00088 /*    written as sub-string comparisons over the maximum range*/
00089 /*    this saves re-reading the same string from file.*/
00090 /* */
00091 /*  - common prefix strings will have to be guarded against, by*/
00092 /*    sorting string values, then sorting the tests in reverse length order.*/
00093 
00094 
00095 ret  ::fileutil::magic::cgen::path (type tree) {
00096     # Annotates the tree. In each node we store the path from the root
00097     # to this node, as list of nodes, with the current node the last
00098     # element. The root node is never stored in the path.
00099 
00100     $tree set root path {}
00101     foreach child [$tree children root] {
00102     $tree walk $child -type dfs node {
00103         set path [$tree get [$tree parent $node] path]
00104         lappend path [$tree index $node]
00105         $tree set $node path $path
00106     }
00107     }
00108     return
00109 }
00110 
00111 ret  ::fileutil::magic::cgen::tree_el (type tree , type parent , type file , type line , type type , type qual , type comp , type offset , type val , type message , type args) {
00112 
00113     # Recursively creates and annotates a node for the specified
00114     # tests, and its sub-tests (args).
00115 
00116     set     node [$tree insert $parent end]
00117     set     path [$tree get    $parent path]
00118     lappend path [$tree index  $node]
00119     $tree set $node path $path
00120 
00121     # generate a proc call type for the type, Numeric or String
00122     variable ::fileutil::magic::rt::typemap
00123 
00124     switch -glob -- $type {
00125     *byte* -
00126     *short* -
00127     *long* -
00128     *date* {
00129         set otype N
00130         set type [lindex $typemap($type) 1]
00131     }
00132     *string {
00133         set otype S
00134     }
00135     default {
00136         puts stderr "Unknown type: '$type'"
00137     }
00138     }
00139 
00140     # Stores the type determined above, and the arguments into
00141     # attributes of the new node.
00142 
00143     foreach key {line type qual comp offset val message file otype} {
00144     if {[catch {
00145         $tree set $node $key [set $key]
00146     } result]} {
00147         upvar ::errorInfo eo
00148         puts "Tree: $eo - $file $line $type"
00149     }
00150     }
00151 
00152     # now add children
00153     foreach el $args {
00154     eval [linsert $el 0 tree_el $tree $node $file]
00155     # 8.5 # tree_el $tree $node $file {*}$el
00156     }
00157     return $node
00158 }
00159 
00160 ret  ::fileutil::magic::cgen::2tree (type script) {
00161 
00162     # Converts a recognizer which is in a simple script form into a
00163     # tree.
00164 
00165     variable tree
00166     set tree [::struct::tree]
00167 
00168     $tree set root path ""
00169     $tree set root otype Root
00170     $tree set root type root
00171     $tree set root message "unknown"
00172 
00173     # generate a test for each match
00174     set file "unknown"
00175     foreach el $script {
00176     #puts "EL: $el"
00177     if {[lindex $el 0] eq "file"} {
00178         set file [lindex $el 1]
00179     } else {
00180         set node [eval [linsert $el 0 tree_el $tree root $file]]
00181         # 8.5 # set more [tree_el $tree root $file {*}$el]
00182         append result $node
00183     }
00184     }
00185     optNum $tree root
00186     #optStr $tree root
00187     puts stderr "Script contains [llength [$tree children root]] discriminators"
00188     path $tree
00189 
00190     # Decoding the offsets, determination if we have to handle
00191     # relative offsets, and where. The less, the better.
00192     Offsets $tree
00193 
00194     return $tree
00195 }
00196 
00197 ret  ::fileutil::magic::cgen::isStr (type tree , type node) {
00198     return [expr {"S" eq [$tree get $node otype]}]
00199 }
00200 
00201 ret  ::fileutil::magic::cgen::sortRegion (type r1 , type r2) {
00202     set cmp 0
00203     if {[catch {
00204     if {[string match (*) $r1] || [string match (*) $r2]} {
00205         set cmp [string compare $r1 $r2]
00206     } else {
00207         set cmp [expr {[lindex $r1 0] - [lindex $r2 0]}]
00208         if {!$cmp} {
00209         set cmp 0
00210         set cmp [expr {[lindex $r1 1] - [lindex $r2 1]}]
00211         }
00212     }
00213     } result]} {
00214     set cmp [string compare $r1 $r2]
00215     }
00216     return $cmp
00217 }
00218 
00219 ret  ::fileutil::magic::cgen::optStr (type tree , type node) {
00220     variable regions
00221     catch {unset regions}
00222     array set regions {}
00223 
00224     optStr1 $tree $node
00225 
00226     puts stderr "Regions [array statistics regions]"
00227     foreach region [lsort \
00228         -index   0 \
00229         -command ::fileutil::magic::cgen::sortRegion \
00230         [array name regions]] {
00231     puts "$region - $regions($region)"
00232     }
00233 }
00234 
00235 ret  ::fileutil::magic::cgen::optStr1 (type tree , type node) {
00236     variable regions
00237 
00238     # traverse each numeric element of this node's children,
00239     # categorising them
00240 
00241     set kids [$tree children $node]
00242     foreach child $kids {
00243     optStr1 $tree $child
00244     }
00245 
00246     set strings [$tree children $node filter ::fileutil::magic::cgen::isStr]
00247     #puts stderr "optstr: $node: $strings"
00248 
00249     foreach el $strings {
00250     #if {[$tree get $el otype] eq "String"} {puts "[$tree getall $el] - [string length [$tree get $el val]]"}
00251     if {[$tree get $el comp] eq "x"} {
00252         continue
00253     }
00254 
00255     set offset [$tree get $el offset]
00256     set len    [string length [$tree get $el val]]
00257     lappend regions([list $offset $len]) $el
00258     }
00259 }
00260 
00261 ret  ::fileutil::magic::cgen::isNum (type tree , type node) {
00262     return [expr {"N" eq [$tree get $node otype]}]
00263 }
00264 
00265 ret  ::fileutil::magic::cgen::switchNSort (type tree , type n1 , type n2) {
00266     return [expr {[$tree get $n1 val] - [$tree get $n1 val]}]
00267 }
00268 
00269 ret  ::fileutil::magic::cgen::optNum (type tree , type node) {
00270     array set offsets {}
00271 
00272     # traverse each numeric element of this node's children,
00273     # categorising them
00274 
00275     set kids [$tree children $node]
00276     foreach child $kids {
00277     optNum $tree $child
00278     }
00279 
00280     set numerics [$tree children $node filter ::fileutil::magic::cgen::isNum]
00281     #puts stderr "optNum: $node: $numerics"
00282     if {[llength $numerics] < 2} {
00283     return
00284     }
00285 
00286     foreach el $numerics {
00287     if {[$tree get $el comp] ne "=="} {
00288         continue
00289     }
00290     lappend offsets([$tree get $el type],[$tree get $el offset],[$tree get $el qual]) $el
00291     }
00292 
00293     #puts "Offset: stderr [array get offsets]"
00294     foreach {match nodes} [array get offsets] {
00295     if {[llength $nodes] < 2} {
00296         continue
00297     }
00298 
00299     catch {unset matcher}
00300     foreach n $nodes {
00301         set nv [expr [$tree get $n val]]
00302         if {[info exists matcher($nv)]} {
00303         puts stderr "*====================================="
00304         puts stderr "* Node         <[$tree getall $n]>"
00305         puts stderr "* clashes with <[$tree getall $matcher($nv)]>"
00306         puts stderr "*====================================="
00307         } else {
00308         set matcher($nv) $n
00309         }
00310     }
00311 
00312     foreach {type offset qual} [split $match ,] break
00313     set switch [$tree insert $node [$tree index [lindex $nodes 0]]]
00314     $tree set $switch otype   Switch
00315     $tree set $switch message $match
00316     $tree set $switch offset  $offset
00317     $tree set $switch type    $type
00318     $tree set $switch qual    $qual
00319 
00320     set nodes [lsort -command [list ::fileutil::magic::cgen::switchNSort $tree] $nodes]
00321 
00322     eval [linsert $nodes 0 $tree move $switch end]
00323     # 8.5 # $tree move $switch end {*}$nodes
00324     set     path [$tree get [$tree parent $switch] path]
00325     lappend path [$tree index $switch]
00326     $tree set $switch path $path
00327     }
00328 }
00329 
00330 ret  ::fileutil::magic::cgen::Offsets (type tree) {
00331 
00332     # Indicator if a node has to save field location information for
00333     # relative addressing. The 'kill' attribute is an accumulated
00334     # 'save' over the whole subtree. It will be used to determine when
00335     # level information was destroyed by subnodes and has to be
00336     # regenerated at the current level.
00337 
00338     $tree walk root -type dfs node {
00339     $tree set $node save 0
00340     $tree set $node kill 0
00341     }
00342 
00343     # We walk from the leafs up to the root, synthesizing the data
00344     # needed, as we go.
00345     $tree walk root -type dfs -order post node {
00346     if {$node eq "root"} continue
00347     DecodeOffset $tree $node [$tree get $node offset]
00348 
00349     # If the current node's parent is a switch, and the node has
00350     # to save, then the switch has to save. Because the current
00351     # node is not relevant during code generation anymore, the
00352     # switch is.
00353 
00354     if {[$tree get $node save]} {
00355         # We save, therefore we kill.
00356         $tree set $node kill 1
00357         if {[$tree get [$tree parent $node] otype] eq "Switch"} {
00358         $tree set [$tree parent $node] save 1
00359         }
00360     } else {
00361         # We don't save i.e. kill, but we may inherit it from
00362         # children which kill.
00363 
00364         foreach c [$tree children $node] {
00365         if {[$tree get $c kill]} {
00366             $tree set $node kill 1
00367             break
00368         }
00369         }
00370     }
00371     }
00372 }
00373 
00374 ret  ::fileutil::magic::cgen::DecodeOffset (type tree , type node , type offset) {
00375     if {[string match "(*)" $offset]} {
00376     # Indirection offset. (Decoding is non-trivial, therefore
00377     # packed into a proc).
00378 
00379     set ind 1 ; # Indirect location
00380     foreach {rel base itype idelta} [DecodeIndirectOffset $offset] break
00381 
00382     } elseif {[string match "&*" $offset]} {
00383     # Direct relative offset. (Decoding is trivial)
00384 
00385     set ind    0       ; # Direct location
00386     set rel    1       ; # Relative
00387     set base   [string range $offset 1 end] ; # Base Delta
00388     set itype  {}      ; # No data for indirect
00389     set idelta {}      ; # s.a.
00390 
00391     } else {
00392     set ind    0       ; # Direct location
00393     set rel    0       ; # Absolute
00394     set base   $offset ; # Here!
00395     set itype  {}      ; # No data for indirect
00396     set idelta {}      ; # s.a.
00397     }
00398 
00399     # Store the expanded data back into the tree.
00400 
00401     foreach v {ind rel base itype idelta} {
00402     $tree set $node $v [set $v]
00403     }
00404 
00405     # For nodes with adressing relative to last field above the latter
00406     # has to save this information.
00407 
00408     if {$rel} {
00409     $tree set [$tree parent $node] save 1
00410     }
00411     return
00412 }
00413 
00414 ret  ::fileutil::magic::cgen::DecodeIndirectOffset (type offset) {
00415     variable otmap ; # Offset typemap.
00416 
00417     # Offset parser.
00418     # Syntax:
00419     #   ( ?&? number ?.[bslBSL]? ?[+-]? ?number? )
00420 
00421     set n {(([0-9]+)|(0x[0-9A-Fa-f]+))}
00422     set o "\\((&?)(${n})((\\.\[bslBSL])?)(\[+-]?)(${n}?)\\)"
00423     #         |   | ||| ||               |       | |||
00424     #         1   2 345 67               8       9 012
00425     #         ^   ^     ^                ^       ^
00426     #         rel base  type             sign    index
00427     #
00428     #                            1   2    3 4 5 6    7 8    9   0 1 2
00429     set ok [regexp $o $offset -> rel base _ _ _ type _ sign idx _ _ _]
00430 
00431     if {!$ok} {
00432         return -code error "Bad offset \"$offset\""
00433     }
00434 
00435     # rel is in {"", &}, map to 0|1
00436     if {$rel eq ""} {set rel 0} else {set rel 1}
00437 
00438     # base is a number, enforce decimal. Not optional.
00439     set base [expr $base]
00440 
00441     # Type is in .b .s .l .B .S .L, and "". Map to a regular magic
00442     # type code.
00443     set type $otmap($type)
00444 
00445     # sign is in {+,-,""}. Map to -|"" (Becomes sign of index)
00446     if {$sign eq "+"} {set sign ""}
00447 
00448     # Index is optional number. Enforce decimal, empty is zero. Add in
00449     # the sign as well for a proper signed index.
00450 
00451     if {$idx eq ""} {set idx 0}
00452     set idx $sign[expr $idx]
00453 
00454     return [list $rel $base $type $idx]
00455 }
00456 
00457 ret  ::fileutil::magic::cgen::treedump (type tree) {
00458     set result ""
00459     $tree walk root -type dfs node {
00460     set path  [$tree get $node path]
00461     set depth [llength $path]
00462 
00463     append result [string repeat "  " $depth] [list $path] ": " [$tree get $node type]:
00464 
00465     if {[$tree keyexists $node offset]} {
00466         append result " ,O|[$tree get $node offset]|"
00467 
00468         set x {}
00469         foreach v {ind rel base itype idelta} {lappend x [$tree get $node $v]}
00470         append result "=<[join $x !]>"
00471     }
00472     if {[$tree keyexists $node qual]} {
00473         set q [$tree get $node qual]
00474         if {$q ne ""} {
00475         append result " ,q/$q/"
00476         }
00477     }
00478 
00479     if {[$tree keyexists $node comp]} {
00480         append result " " C([$tree get $node comp])
00481     }
00482     if {[$tree keyexists $node val]} {
00483         append result " " V([$tree get $node val])
00484     }
00485 
00486     if {[$tree keyexists $node otype]} {
00487         append result " " [$tree get $node otype]/[$tree get $node save]
00488     }
00489 
00490     if {$depth == 1} {
00491         set msg [$tree get $node message]
00492         set n $node
00493         while {($n != {}) && ($msg == "")} {
00494         set n [lindex [$tree children $n] 0]
00495         if {$n != {}} {
00496             set msg [$tree get $n message]
00497         }
00498         }
00499         append result " " ( $msg )
00500         if {[$tree keyexists $node file]} {
00501         append result " - " [$tree get $node file]
00502         }
00503     }
00504 
00505     #append result " <" [$tree getall $node] >
00506     append result \n
00507     }
00508     return $result
00509 }
00510 
00511 ret  ::fileutil::magic::cgen::treegen (type tree , type node) {
00512     return "[treegen1 $tree $node]\nresult\n"
00513 }
00514 
00515 ret  ::fileutil::magic::cgen::treegen1 (type tree , type node) {
00516     variable ::fileutil::magic::rt::typemap
00517 
00518     set result ""
00519     foreach k {otype type offset comp val qual message save path} {
00520     if {[$tree keyexists $node $k]} {
00521         set $k [$tree get $node $k]
00522     }
00523     }
00524 
00525     set level [llength $path]
00526 
00527     # Generate code for each node per its type.
00528 
00529     switch $otype {
00530     N -
00531     S {
00532         if {$save} {
00533         # We have to save field data for relative adressing under this
00534         # leaf.
00535         if {$otype eq "N"} {
00536             set type [list Nx $level $type]
00537         } elseif {$otype eq "S"} {
00538             set type [list Sx $level]
00539         }
00540         } else {
00541         # Regular fetching of information.
00542         if {$otype eq "N"} {
00543             set type [list N $type]
00544         } elseif {$otype eq "S"} {
00545             set type S
00546         }
00547         }
00548 
00549         set offset [GenerateOffset $tree $node]
00550 
00551         if {$qual eq ""} {
00552         append result "if \{\[$type $offset $comp [list $val]\]\} \{"
00553         } else {
00554         append result "if \{\[$type $offset $comp [list $val] $qual\]\} \{"
00555         }
00556 
00557         if {[$tree isleaf $node]} {
00558         if {$message ne ""} {
00559             append result "emit [list $message]"
00560         } else {
00561             append result "emit [$tree get $node path]"
00562         }
00563         } else {
00564         # If we saved data the child branches may destroy
00565         # level information. We regenerate it if needed.
00566 
00567         if {$message ne ""} {
00568             append result "emit [list $message]\n"
00569         }
00570 
00571         set killed 0
00572         foreach child [$tree children $node] {
00573             if {$save && $killed && [$tree get $child rel]} {
00574             # This location already does not regenerate if
00575             # the killing subnode was last. We also do not
00576             # need to regenerate if the current subnode
00577             # does not use relative adressing.
00578             append result "L $level;"
00579             set killed 0
00580             }
00581             append result [treegen1 $tree $child]
00582             set killed [expr {$killed || [$tree get $child kill]}]
00583         }
00584         #append result "\nreturn \$result"
00585         }
00586 
00587         append result "\}\n"
00588     }
00589     Root {
00590         foreach child [$tree children $node] {
00591         append result [treegen1 $tree $child]
00592         }
00593     }
00594     Switch {
00595         set offset [GenerateOffset $tree $node]
00596 
00597         if {$save} {
00598         set fetch "Nvx $level"
00599         } else {
00600         set fetch Nv
00601         }
00602 
00603         append fetch " " $type " " $offset
00604         if {$qual ne ""} {
00605         append fetch " " $qual
00606         }
00607         append result "switch -- \[$fetch\] "
00608 
00609         set scan [lindex $typemap($type) 1]
00610 
00611         set ckilled 0
00612         foreach child [$tree children $node] {
00613         binary scan [binary format $scan [$tree get $child val]] $scan val
00614         append result "$val \{"
00615 
00616         if {$save && $ckilled} {
00617             # This location already does not regenerate if
00618             # the killing subnode was last. We also do not
00619             # need to regenerate if the current subnode
00620             # does not use relative adressing.
00621             append result "L $level;"
00622             set ckilled 0
00623         }
00624 
00625         if {[$tree isleaf $child]} {
00626             append result "emit [list [$tree get $child message]]"
00627         } else {
00628             set killed 0
00629             append result "emit [list [$tree get $child message]]\n"
00630             foreach grandchild [$tree children $child] {
00631             if {$save && $killed && [$tree get $grandchild rel]} {
00632                 # This location already does not regenerate if
00633                 # the killing subnode was last. We also do not
00634                 # need to regenerate if the current subnode
00635                 # does not use relative adressing.
00636                 append result "L $level;"
00637                 set killed 0
00638             }
00639             append result [treegen1 $tree $grandchild]
00640             set killed [expr {$killed || [$tree get $grandchild kill]}]
00641             }
00642         }
00643 
00644         set ckilled [expr {$ckilled || [$tree get $child kill]}]
00645         append result "\} "
00646         }
00647         append result "\n"
00648     }
00649     }
00650     return $result
00651 }
00652 
00653 proc ::fileutil::magic::cgen::GenerateOffset {tree node} {
00654     # Examples:
00655     # direct absolute:     45      -> 45
00656     # direct relative:    &45      -> [R 45]
00657     # indirect absolute:  (45.s+1) -> [I 45 s 1]
00658     # indirect relative: (&45.s+1) -> [I [R 45] s 1]
00659 
00660     foreach v {ind rel base itype idelta} {
00661     set $v [$tree get $node $v]
00662     }
00663 
00664     if {$rel} {set base "\[R $base\]"}
00665     if {$ind} {set base "\[I $base $itype $idelta\]"}
00666     return $base
00667 }
00668 
00669 # ### ### ### ######### ######### #########
00670 ## Ready for use.
00671 # EOF
00672 

Generated on 21 Sep 2010 for Gui by  doxygen 1.6.1