tclparser-8.1.tcl

Go to the documentation of this file.
00001 /*  tclparser-8.1.tcl --*/
00002 /* */
00003 /*  This file provides a Tcl implementation of a XML parser.*/
00004 /*  This file supports Tcl 8.1.*/
00005 /* */
00006 /*  See xml-8.[01].tcl for definitions of character sets and*/
00007 /*  regular expressions.*/
00008 /* */
00009 /*  Copyright (c) 2005-2008 by Explain.*/
00010 /*  http://www.explain.com.au/*/
00011 /*  Copyright (c) 1998-2003 Zveno Pty Ltd*/
00012 /*  http://www.zveno.com/*/
00013 /*  */
00014 /*  See the file "LICENSE" in this distribution for information on usage and*/
00015 /*  redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES.*/
00016 /* */
00017 /*  $Id: tclparser-8.1.tcl,v 1.26.2.1 2005/12/28 06:49:51 balls Exp $*/
00018 
00019 package require Tcl 8.1
00020 
00021 package provide xml::tclparser 3.2
00022 
00023 package require xmldefs 3.2
00024 
00025 package require sgmlparser 1.0
00026 
00027 namespace xml::tclparser {
00028 
00029     namespace export create createexternal externalentity parse configure get delete
00030 
00031     /*  Tokenising expressions*/
00032 
00033     variable tokExpr $::xml::tokExpr
00034     variable substExpr $::xml::substExpr
00035 
00036     /*  Register this parser class*/
00037 
00038     ::xml::parserclass create tcl \
00039         -createcommand [namespace code create] \
00040         -createentityparsercommand [namespace code createentityparser] \
00041         -parsecommand [namespace code parse] \
00042         -configurecommand [namespace code configure] \
00043         -deletecommand [namespace code delete] \
00044         -recommand =  [namespace code re]
00045 }
00046 
00047 # xml = ::tclparser::create --
00048 /* */
00049 /*  Creates XML parser object.*/
00050 /* */
00051 /*  Arguments:*/
00052 /*  name    unique identifier for this instance*/
00053 /* */
00054 /*  Results:*/
00055 /*  The state variable is initialised.*/
00056 
00057 ret  xml::tclparser::create name (
00058 
00059     # type Initialise , type state , type variable
00060     , type upvar \#0 [, type namespace , type current]::$, type name , type parser
00061     , type array , type set , type parser [, type list -, type name $, type name            \
00062     -, type cmd [, type uplevel 3 , type namespace , type current]::$, type name    \
00063     -, type final 1                 \
00064     -, type validate 0                  \
00065     -, type statevariable [, type namespace , type current]::$, type name   \
00066     -, type baseuri , optional                  \
00067     , type internaldtd , optional                   \
00068     , type entities [, type namespace , type current]::, type Entities$, type name  \
00069     , type extentities [, type namespace , type current]::, type ExtEntities$, type name    \
00070     , type parameterentities [, type namespace , type current]::, type PEntities$, type name    \
00071     , type externalparameterentities [, type namespace , type current]::, type ExtPEntities$, type name \
00072     , type elementdecls [, type namespace , type current]::, type ElDecls$, type name   \
00073     , type attlistdecls [, type namespace , type current]::, type AttlistDecls$, type name  \
00074     , type notationdecls [, type namespace , type current]::, type NotDecls$, type name \
00075     , type depth 0                      \
00076     , type leftover , optional                                      \
00077     ]
00078 
00079     # , type Initialise , type entities , type with , type predefined , type set
00080     , type array , type set [, type namespace , type current]::, type Entities$, type name [, type array , type get ::, type sgml::, type EntityPredef]
00081 
00082     , type return $, type parser(-, type cmd)
00083 )
00084 
00085 # xml::tclparser::createentityparser --
00086 #
00087 #   Creates XML parser object for an entity.
00088 #
00089 # Arguments:
00090 #   name    name for the new parser
00091 #   parent  name of parent parser
00092 #
00093 # Results:
00094 #   The state variable is initialised.
00095 
00096 proc xml::tclparser::createentityparser {parent name} {
00097     upvar /* 0 [namespace current]::$parent p*/
00098 
00099     /*  Initialise state variable*/
00100     upvar \/* 0 [namespace current]::$name external*/
00101     array  external =  [array get p]
00102 
00103     regsub $parent $p(-cmd) {} parentns
00104 
00105     array  external =  [list -name $name        \
00106     -cmd $parentns$name             \
00107     -statevariable [namespace current]::$name   \
00108     internaldtd {}                  \
00109     line 0                      \
00110     ]
00111     incr external(depth)
00112 
00113     return $external(-cmd)
00114 }
00115 
00116 /*  xml::tclparser::configure --*/
00117 /* */
00118 /*  Configures a XML parser object.*/
00119 /* */
00120 /*  Arguments:*/
00121 /*  name    unique identifier for this instance*/
00122 /*  args    option name/value pairs*/
00123 /* */
00124 /*  Results:*/
00125 /*  May change values of config options*/
00126 
00127 ret  xml::tclparser::configure (type name , type args) {
00128     upvar \#0 [namespace current]::$name parser
00129 
00130     # BUG: very crude, no checks for illegal args
00131     # Mats: Should be synced with sgmlparser.tcl
00132     set options {-elementstartcommand -elementendcommand \
00133       -characterdatacommand -processinginstructioncommand \
00134       -externalentitycommand -xmldeclcommand \
00135       -doctypecommand -commentcommand \
00136       -entitydeclcommand -unparsedentitydeclcommand \
00137       -parameterentitydeclcommand -notationdeclcommand \
00138       -elementdeclcommand -attlistdeclcommand \
00139       -paramentityparsing -defaultexpandinternalentities \
00140       -startdoctypedeclcommand -enddoctypedeclcommand \
00141       -entityreferencecommand -warningcommand \
00142       -defaultcommand -unknownencodingcommand -notstandalonecommand \
00143       -startcdatasectioncommand -endcdatasectioncommand \
00144       -errorcommand -final \
00145       -validate -baseuri -baseurl \
00146       -name -cmd -emptyelement \
00147       -parseattributelistcommand -parseentitydeclcommand \
00148       -normalize -internaldtd -dtdsubset \
00149       -reportempty -ignorewhitespace \
00150       -reportempty \
00151     }
00152     set usage [join $options ", "]
00153     regsub -all -- - $options {} options
00154     set pat ^-([join $options |])$
00155     foreach {flag value} $args {
00156     if {[regexp $pat $flag]} {
00157         # Validate numbers
00158         if {[info exists parser($flag)] && \
00159             [string is integer -strict $parser($flag)] && \
00160             ![string is integer -strict $value]} {
00161         return -code error "Bad value for $flag ($value), must be integer"
00162         }
00163         set parser($flag) $value
00164     } else {
00165         return -code error "Unknown option $flag, can be: $usage"
00166     }
00167     }
00168 
00169     # Backward-compatibility: -baseuri is a synonym for -baseurl
00170     catch {set parser(-baseuri) $parser(-baseurl)}
00171 
00172     return {}
00173 }
00174 
00175 /*  xml::tclparser::parse --*/
00176 /* */
00177 /*  Parses document instance data*/
00178 /* */
00179 /*  Arguments:*/
00180 /*  name    parser object*/
00181 /*  xml data*/
00182 /*  args    configuration options*/
00183 /* */
00184 /*  Results:*/
00185 /*  Callbacks are invoked*/
00186 
00187 ret  xml::tclparser::parse (type name , type xml , type args) {
00188 
00189     array set options $args
00190     upvar \#0 [namespace current]::$name parser
00191     variable tokExpr
00192     variable substExpr
00193 
00194     # Mats:
00195     if {[llength $args]} {
00196     eval {configure $name} $args
00197     }
00198 
00199     set parseOptions [list \
00200         -emptyelement [namespace code ParseEmpty] \
00201         -parseattributelistcommand [namespace code ParseAttrs] \
00202         -parseentitydeclcommand [namespace code ParseEntity] \
00203         -normalize 0]
00204     eval lappend parseOptions \
00205         [array get parser -*command] \
00206         [array get parser -reportempty] \
00207         [array get parser -ignorewhitespace] \
00208         [array get parser -name] \
00209         [array get parser -cmd] \
00210         [array get parser -baseuri] \
00211         [array get parser -validate] \
00212         [array get parser -final] \
00213         [array get parser -defaultexpandinternalentities] \
00214         [array get parser entities] \
00215         [array get parser extentities] \
00216         [array get parser parameterentities] \
00217         [array get parser externalparameterentities] \
00218         [array get parser elementdecls] \
00219         [array get parser attlistdecls] \
00220         [array get parser notationdecls]
00221 
00222     # Mats:
00223     # If -final 0 we also need to maintain the state with a -statevariable !
00224     if {!$parser(-final)} {
00225     eval lappend parseOptions [array get parser -statevariable]
00226     }
00227 
00228     set dtdsubset no
00229     catch {set dtdsubset $options(-dtdsubset)}
00230     switch -- $dtdsubset {
00231     internal {
00232         # Bypass normal parsing
00233         lappend parseOptions -statevariable $parser(-statevariable)
00234         array set intOptions [array get ::sgml::StdOptions]
00235         array set intOptions $parseOptions
00236         ::sgml::ParseDTD:Internal [array get intOptions] $xml
00237         return {}
00238     }
00239     external {
00240         # Bypass normal parsing
00241         lappend parseOptions -statevariable $parser(-statevariable)
00242         array set intOptions [array get ::sgml::StdOptions]
00243         array set intOptions $parseOptions
00244         ::sgml::ParseDTD:External [array get intOptions] $xml
00245         return {}
00246     }
00247     default {
00248         # Pass through to normal processing
00249     }
00250     }
00251 
00252     lappend tokenOptions  \
00253       -internaldtdvariable [namespace current]::${name}(internaldtd)
00254     
00255     # Mats: If -final 0 we also need to maintain the state with a -statevariable !
00256     if {!$parser(-final)} {
00257     eval lappend tokenOptions [array get parser -statevariable] \
00258       [array get parser -final]
00259     }
00260     
00261     # Mats:
00262     # Why not the first four? Just padding? Lrange undos \n interp.
00263     # It is necessary to have the first four as well if chopped off in
00264     # middle of pcdata.
00265     set tokenised [lrange \
00266         [eval {::sgml::tokenise $xml $tokExpr $substExpr} $tokenOptions] \
00267     0 end]
00268 
00269     lappend parseOptions -internaldtd [list $parser(internaldtd)]
00270     eval ::sgml::parseEvent [list $tokenised] $parseOptions
00271 
00272     return {}
00273 }
00274 
00275 /*  xml::tclparser::ParseEmpty --  Tcl 8.1+ version*/
00276 /* */
00277 /*  Used by parser to determine whether an element is empty.*/
00278 /*  This is usually dead easy in XML, but as always not quite.*/
00279 /*  Have to watch out for empty element syntax*/
00280 /* */
00281 /*  Arguments:*/
00282 /*  tag element name*/
00283 /*  attr    attribute list (raw)*/
00284 /*  e   End tag delimiter.*/
00285 /* */
00286 /*  Results:*/
00287 /*  Return value of e*/
00288 
00289 ret  xml::tclparser::ParseEmpty (type tag , type attr , type e) {
00290     switch -glob [string length $e],[regexp "/[::xml::cl $::xml::Wsp]*$" $attr] {
00291     0,0 {
00292         return {}
00293     }
00294     0,* {
00295         return /
00296     }
00297     default {
00298         return $e
00299     }
00300     }
00301 }
00302 
00303 /*  xml::tclparser::ParseAttrs -- Tcl 8.1+ version*/
00304 /* */
00305 /*  Parse element attributes.*/
00306 /* */
00307 /*  There are two forms for name-value pairs:*/
00308 /* */
00309 /*  name="value"*/
00310 /*  name='value'*/
00311 /* */
00312 /*  Arguments:*/
00313 /*  opts    parser options*/
00314 /*  attrs   attribute string given in a tag*/
00315 /* */
00316 /*  Results:*/
00317 /*  Returns a Tcl list representing the name-value pairs in the */
00318 /*  attribute string*/
00319 /* */
00320 /*  A ">" occurring in the attribute list causes problems when parsing*/
00321 /*  the XML.  This manifests itself by an unterminated attribute value*/
00322 /*  and a ">" appearing the element text.*/
00323 /*  In this case return a three element list;*/
00324 /*  the message "unterminated attribute value", the attribute list it*/
00325 /*  did manage to parse and the remainder of the attribute list.*/
00326 
00327 ret  xml::tclparser::ParseAttrs (type opts , type attrs) {
00328 
00329     set result {}
00330 
00331     while {[string length [string trim $attrs]]} {
00332     if {[regexp [::sgml::cl $::xml::Wsp]*($::xml::Name)[::sgml::cl $::xml::Wsp]*=[::sgml::cl $::xml::Wsp]*("|')([::sgml::cl ^<]*?)\\2(.*) $attrs discard attrName delimiter value attrs]} {
00333         lappend result $attrName [NormalizeAttValue $opts $value]
00334     } elseif {[regexp [::sgml::cl $::xml::Wsp]*$::xml::Name[::sgml::cl $::xml::Wsp]*=[::sgml::cl $::xml::Wsp]*("|')[::sgml::cl ^<]*\$ $attrs]} {
00335         return -code error [list {unterminated attribute value} $result $attrs]
00336     } else {
00337         return -code error "invalid attribute list"
00338     }
00339     }
00340 
00341     return $result
00342 }
00343 
00344 /*  xml::tclparser::NormalizeAttValue --*/
00345 /* */
00346 /*  Perform attribute value normalisation.  This involves:*/
00347 /*  . character references are appended to the value*/
00348 /*  . entity references are recursively processed and replacement value appended*/
00349 /*  . whitespace characters cause a space to be appended*/
00350 /*  . other characters appended as-is*/
00351 /* */
00352 /*  Arguments:*/
00353 /*  opts    parser options*/
00354 /*  value   unparsed attribute value*/
00355 /* */
00356 /*  Results:*/
00357 /*  Normalised value returned.*/
00358 
00359 ret  xml::tclparser::NormalizeAttValue (type opts , type value) {
00360 
00361     # sgmlparser already has backslashes protected
00362     # Protect Tcl specials
00363     regsub -all {([][$])} $value {\\\1} value
00364 
00365     # Deal with white space
00366     regsub -all "\[$::xml::Wsp\]" $value { } value
00367 
00368     # Find entity refs
00369     regsub -all {&([^;]+);} $value {[NormalizeAttValue:DeRef $opts {\1}]} value
00370 
00371     return [subst $value]
00372 }
00373 
00374 /*  xml::tclparser::NormalizeAttValue:DeRef --*/
00375 /* */
00376 /*  Handler to normalize attribute values*/
00377 /* */
00378 /*  Arguments:*/
00379 /*  opts    parser options*/
00380 /*  ref entity reference*/
00381 /* */
00382 /*  Results:*/
00383 /*  Returns character*/
00384 
00385 ret  xml::tclparser::NormalizeAttValue:DeRef (type opts , type ref) {
00386     # SRB: Bug fix 2008-11-18 #812051: surround case labels in braces for compatibility with Freewrap
00387     switch -glob -- $ref {
00388     {#x*} {
00389         scan [string range $ref 2 end] %x value
00390         set char [format %c $value]
00391         # Check that the char is legal for XML
00392         if {[regexp [format {^[%s]$} $::xml::Char] $char]} {
00393         return $char
00394         } else {
00395         return -code error "illegal character"
00396         }
00397     }
00398     {#*} {
00399         scan [string range $ref 1 end] %d value
00400         set char [format %c $value]
00401         # Check that the char is legal for XML
00402         if {[regexp [format {^[%s]$} $::xml::Char] $char]} {
00403         return $char
00404         } else {
00405         return -code error "illegal character"
00406         }
00407     }
00408     lt -
00409     gt -
00410     amp -
00411     quot -
00412     apos {
00413         array set map {lt < gt > amp & quot \" apos '}
00414         return $map($ref)
00415     }
00416     default {
00417         # A general entity.  Must resolve to a text value - no element structure.
00418 
00419         array set options $opts
00420         upvar #0 $options(entities) map
00421 
00422         if {[info exists map($ref)]} {
00423 
00424         if {[regexp < $map($ref)]} {
00425             return -code error "illegal character \"<\" in attribute value"
00426         }
00427 
00428         if {![regexp & $map($ref)]} {
00429             # Simple text replacement
00430             return $map($ref)
00431         }
00432 
00433         # There are entity references in the replacement text.
00434         # Can't use child entity parser since must catch element structures
00435 
00436         return [NormalizeAttValue $opts $map($ref)]
00437 
00438         } elseif {[string compare $options(-entityreferencecommand) "::sgml::noop"]} {
00439 
00440         set result [uplevel #0 $options(-entityreferencecommand) [list $ref]]
00441 
00442         return $result
00443 
00444         } else {
00445         return -code error "unable to resolve entity reference \"$ref\""
00446         }
00447     }
00448     }
00449 }
00450 
00451 /*  xml::tclparser::ParseEntity --*/
00452 /* */
00453 /*  Parse general entity declaration*/
00454 /* */
00455 /*  Arguments:*/
00456 /*  data    text to parse*/
00457 /* */
00458 /*  Results:*/
00459 /*  Tcl list containing entity declaration*/
00460 
00461 ret  xml::tclparser::ParseEntity data (
00462     type set , type data [, type string , type trim $, type data]
00463     , type if , optional [regexp =$::sgml::ExternalEntityExpr $data =discard type =delimiter1 id1 =discard delimiter2 =id2 optNDATA =ndata] , optional 
00464     switch =$type {
00465         =PUBLIC {
00466         =return [list =external $id2 =$id1 $ndata]
00467         
00468         , type SYSTEM , optional 
00469         return =[list external =$id1 { $, type ndata]
00470         )
00471     }
00472     } elseif {[regexp {^("|')(.*?)\1$} $data discard delimiter value]} {
00473     return [list internal $value]
00474     } else {
00475     return -code error "badly formed entity declaration"
00476     }
00477 }
00478 
00479 # xml::tclparser::delete --
00480 #
00481 #   Destroy parser data
00482 #
00483 # Arguments:
00484 #   name    parser object
00485 #
00486 # Results:
00487 #   Parser data structure destroyed
00488 
00489 proc xml::tclparser::delete name {
00490     upvar \#0 [namespace current]::$name parser
00491     catch {::sgml::ParserDelete $parser(-statevariable)}
00492     catch {unset parser}
00493     return {}
00494 }
00495 
00496 # xml::tclparser::get --
00497 #
00498 #   Retrieve additional information from the parser
00499 #
00500 # Arguments:
00501 #   name    parser object
00502 #   method  info to retrieve
00503 #   args    additional arguments for method
00504 #
00505 # Results:
00506 #   Depends on method
00507 
00508 proc xml::tclparser::get {name method args} {
00509     upvar #0 [namespace current]::$name parser
00510 
00511     switch -- $method {
00512 
00513     elementdecl {
00514         switch [llength $args] {
00515 
00516         0 {
00517             # Return all element declarations
00518             upvar #0 $parser(elementdecls) elements
00519             return [array get elements]
00520         }
00521 
00522         1 {
00523             # Return specific element declaration
00524             upvar #0 $parser(elementdecls) elements
00525             if {[info exists elements([lindex $args 0])]} {
00526             return [array get elements [lindex $args 0]]
00527             } else {
00528             return -code error "element \"[lindex $args 0]\" not declared"
00529             }
00530         }
00531 
00532         default {
00533             return -code error "wrong number of arguments: should be \"elementdecl ?element?\""
00534         }
00535         }
00536     }
00537 
00538     attlist {
00539         if {[llength $args] != 1} {
00540         return -code error "wrong number of arguments: should be \"get attlist element\""
00541         }
00542 
00543         upvar /* 0 $parser(attlistdecls)*/
00544 
00545         return {}
00546     }
00547 
00548     entitydecl {
00549     }
00550 
00551     parameterentitydecl {
00552     }
00553 
00554     notationdecl {
00555     }
00556 
00557     default {
00558         return -code error "unknown ret  \"$method\""
00559     }
00560     }
00561 
00562     return ()
00563 }
00564 
00565 # xml::tclparser::ExternalEntity --
00566 #
00567 #   Resolve and parse external entity
00568 #
00569 # Arguments:
00570 #   name    parser object
00571 #   base    base URL
00572 #   sys system identifier
00573 #   pub public identifier
00574 #
00575 # Results:
00576 #   External entity is fetched and parsed
00577 
00578 proc xml::tclparser::ExternalEntity {name base sys pub} {
00579 }
00580 
00581 # xml::tclparser:: --
00582 #
00583 #   Reset a parser instance, ready to parse another document
00584 #
00585 # Arguments:
00586 #   name    parser object
00587 #
00588 # Results:
00589 #   Variables unset
00590 
00591 proc xml::tclparser::reset {name} {
00592     upvar \#0 [namespace current]::$name parser
00593 
00594     # Has this parser object been properly initialised?
00595     if {![info exists parser] || \
00596         ![info exists parser(-name)]} {
00597     return [create $name]
00598     }
00599 
00600     array  parser =  {
00601     -final 1
00602     depth 0
00603     leftover {}
00604     }
00605 
00606     foreach var {Entities ExtEntities PEntities ExtPEntities ElDecls AttlistDecls NotDecls} {
00607     catch {un [namespace =  current]::${var}$name}
00608     }
00609 
00610     /*  Initialise entities with predefined set*/
00611     array  [namespace =  current]::Entities$name [array get ::sgml::EntityPredef]
00612 
00613     return {}
00614 }
00615 

Generated on 21 Sep 2010 for Gui by  doxygen 1.6.1