htmlparse.tcl

Go to the documentation of this file.
00001 /*  htmlparse.tcl --*/
00002 /* */
00003 /*  This file implements a simple HTML parsing library in Tcl.*/
00004 /*  It may take advantage of parsers coded in C in the future.*/
00005 /* */
00006 /*  The functionality here is a subset of the*/
00007 /* */
00008 /*      Simple HTML display library by Stephen Uhler (stephen.uhler@sun.com)*/
00009 /*      Copyright (c) 1995 by Sun Microsystems*/
00010 /*      Version 0.3 Fri Sep  1 10:47:17 PDT 1995*/
00011 /* */
00012 /*  The main restriction is that all Tk-related code in the above*/
00013 /*  was left out of the code here. It is expected that this code*/
00014 /*  will go into a 'tklib' in the future.*/
00015 /* */
00016 /*  Copyright (c) 2001 by ActiveState Tool Corp.*/
00017 /*  See the file license.terms.*/
00018 
00019 package require Tcl       8.2
00020 package require struct::stack
00021 package require cmdline   1.1
00022 
00023 namespace ::htmlparse {
00024     namespace export        \
00025         parse       \
00026         debugCallback   \
00027         mapEscapes      \
00028         2tree       \
00029         removeVisualFluff   \
00030         removeFormDefs
00031 
00032     /*  Table of escape characters. Maps from their names to the actual*/
00033     /*  character.  See http://htmlhelp.org/reference/html40/entities/*/
00034 
00035     variable namedEntities
00036 
00037     /*  I. Latin-1 Entities (HTML 4.01)*/
00038     array  namedEntities =  {
00039     nbsp \xa0 iexcl \xa1 cent \xa2 pound \xa3 curren \xa4
00040     yen \xa5 brvbar \xa6 sect \xa7 uml \xa8 copy \xa9
00041     ordf \xaa laquo \xab not \xac shy \xad reg \xae
00042     macr \xaf deg \xb0 plusmn \xb1 sup2 \xb2 sup3 \xb3
00043     acute \xb4 micro \xb5 para \xb6 middot \xb7 cedil \xb8
00044     sup1 \xb9 ordm \xba raquo \xbb frac14 \xbc frac12 \xbd
00045     frac34 \xbe iquest \xbf Agrave \xc0 Aacute \xc1 Acirc \xc2
00046     Atilde \xc3 Auml \xc4 Aring \xc5 AElig \xc6 Ccedil \xc7
00047     Egrave \xc8 Eacute \xc9 Ecirc \xca Euml \xcb Igrave \xcc
00048     Iacute \xcd Icirc \xce Iuml \xcf ETH \xd0 Ntilde \xd1
00049     Ograve \xd2 Oacute \xd3 Ocirc \xd4 Otilde \xd5 Ouml \xd6
00050     times \xd7 Oslash \xd8 Ugrave \xd9 Uacute \xda Ucirc \xdb
00051     Uuml \xdc Yacute \xdd THORN \xde szlig \xdf agrave \xe0
00052     aacute \xe1 acirc \xe2 atilde \xe3 auml \xe4 aring \xe5
00053     aelig \xe6 ccedil \xe7 egrave \xe8 eacute \xe9 ecirc \xea
00054     euml \xeb igrave \xec iacute \xed icirc \xee iuml \xef
00055     eth \xf0 ntilde \xf1 ograve \xf2 oacute \xf3 ocirc \xf4
00056     otilde \xf5 ouml \xf6 divide \xf7 oslash \xf8 ugrave \xf9
00057     uacute \xfa ucirc \xfb uuml \xfc yacute \xfd thorn \xfe
00058     yuml \xff
00059     }
00060 
00061     /*  II. Entities for Symbols and Greek Letters (HTML 4.01)*/
00062     array  namedEntities =  {
00063     fnof \u192 Alpha \u391 Beta \u392 Gamma \u393 Delta \u394
00064     Epsilon \u395 Zeta \u396 Eta \u397 Theta \u398 Iota \u399
00065     Kappa \u39A Lambda \u39B Mu \u39C Nu \u39D Xi \u39E
00066     Omicron \u39F Pi \u3A0 Rho \u3A1 Sigma \u3A3 Tau \u3A4
00067     Upsilon \u3A5 Phi \u3A6 Chi \u3A7 Psi \u3A8 Omega \u3A9
00068     alpha \u3B1 beta \u3B2 gamma \u3B3 delta \u3B4 epsilon \u3B5
00069     zeta \u3B6 eta \u3B7 theta \u3B8 iota \u3B9 kappa \u3BA
00070     lambda \u3BB mu \u3BC nu \u3BD xi \u3BE omicron \u3BF
00071     pi \u3C0 rho \u3C1 sigmaf \u3C2 sigma \u3C3 tau \u3C4
00072     upsilon \u3C5 phi \u3C6 chi \u3C7 psi \u3C8 omega \u3C9
00073     thetasym \u3D1 upsih \u3D2 piv \u3D6 bull \u2022
00074     hellip \u2026 prime \u2032 Prime \u2033 oline \u203E
00075     frasl \u2044 weierp \u2118 image \u2111 real \u211C
00076     trade \u2122 alefsym \u2135 larr \u2190 uarr \u2191
00077     rarr \u2192 darr \u2193 harr \u2194 crarr \u21B5
00078     lArr \u21D0 uArr \u21D1 rArr \u21D2 dArr \u21D3 hArr \u21D4
00079     forall \u2200 part \u2202 exist \u2203 empty \u2205
00080     nabla \u2207 isin \u2208 notin \u2209 ni \u220B prod \u220F
00081     sum \u2211 minus \u2212 lowast \u2217 radic \u221A
00082     prop \u221D infin \u221E ang \u2220 and \u2227 or \u2228
00083     cap \u2229 cup \u222A int \u222B there4 \u2234 sim \u223C
00084     cong \u2245 asymp \u2248 ne \u2260 equiv \u2261 le \u2264
00085     ge \u2265 sub \u2282 sup \u2283 nsub \u2284 sube \u2286
00086     supe \u2287 oplus \u2295 otimes \u2297 perp \u22A5
00087     sdot \u22C5 lceil \u2308 rceil \u2309 lfloor \u230A
00088     rfloor \u230B lang \u2329 rang \u232A loz \u25CA
00089     spades \u2660 clubs \u2663 hearts \u2665 diams \u2666
00090     }
00091 
00092     /*  III. Special Entities (HTML 4.01)*/
00093     array  namedEntities =  {
00094     quot \x22 amp \x26 lt \x3C gt \x3E OElig \u152 oelig \u153
00095     Scaron \u160 scaron \u161 Yuml \u178 circ \u2C6
00096     tilde \u2DC ensp \u2002 emsp \u2003 thinsp \u2009
00097     zwnj \u200C zwj \u200D lrm \u200E rlm \u200F ndash \u2013
00098     mdash \u2014 lsquo \u2018 rsquo \u2019 sbquo \u201A
00099     ldquo \u201C rdquo \u201D bdquo \u201E dagger \u2020
00100     Dagger \u2021 permil \u2030 lsaquo \u2039 rsaquo \u203A
00101     euro \u20AC
00102     }
00103 
00104     /*  Internal cache for the foreach variable-lists and the*/
00105     /*  substitution strings used to split a HTML string into*/
00106     /*  incrementally handleable scripts. This should reduce the*/
00107     /*  time compute this information for repeated calls with the same*/
00108     /*  split-factor. The array is indexed by a combination of the*/
00109     /*  numerical split factor and the length of the command prefix and*/
00110     /*  maps this to a 2-element list containing variable- and*/
00111     /*  subst-string.*/
00112 
00113     variable  splitdata
00114     array  splitdata =  {}
00115 
00116 }
00117 
00118 /*  htmlparse::parse --*/
00119 /* */
00120 /*  This command is the basic parser for HTML. It takes a HTML*/
00121 /*  string, parses it and invokes a command prefix for every tag*/
00122 /*  encountered. It is not necessary for the HTML to be valid for*/
00123 /*  this parser to function. It is the responsibility of the*/
00124 /*  command invoked for every tag to check this. Another*/
00125 /*  responsibility of the invoked command is the handling of tag*/
00126 /*  attributes and character entities (escaped characters). The*/
00127 /*  parser provides the un-interpreted tag attributes to the*/
00128 /*  invoked command to aid in the former, and the package at large*/
00129 /*  provides a helper command, '::htmlparse::mapEscapes', to aid*/
00130 /*  in the handling of the latter. The parser *does* ignore*/
00131 /*  leading DOCTYPE declarations and all valid HTML comments it*/
00132 /*  encounters.*/
00133 /* */
00134 /*  All information beyond the HTML string itself is specified via*/
00135 /*  options, these are explained below.*/
00136 /* */
00137 /*  To help understanding the options some more background*/
00138 /*  information about the parser.*/
00139 /* */
00140 /*  It is capable to detect incomplete tags in the HTML string*/
00141 /*  given to it. Under normal circumstances this will cause the*/
00142 /*  parser to throw an error, but if the option '-incvar' is used*/
00143 /*  to specify a global (or namespace) variable the parser will*/
00144 /*  store the incomplete part of the input into this variable*/
00145 /*  instead. This will aid greatly in the handling of*/
00146 /*  incrementally arriving HTML as the parser will handle whatever*/
00147 /*  he can and defer the handling of the incomplete part until*/
00148 /*  more data has arrived.*/
00149 /* */
00150 /*  Another feature of the parser are its two possible modes of*/
00151 /*  operation. The normal mode is activated if the option '-queue'*/
00152 /*  is not present on the command line invoking the parser. If it*/
00153 /*  is present the parser will go into the incremental mode instead.*/
00154 /* */
00155 /*  The main difference is that a parser in normal mode will*/
00156 /*  immediately invoke the command prefix for each tag it*/
00157 /*  encounters. In incremental mode however the parser will*/
00158 /*  generate a number of scripts which invoke the command prefix*/
00159 /*  for groups of tags in the HTML string and then store these*/
00160 /*  scripts in the specified queue. It is then the responsibility*/
00161 /*  of the caller of the parser to ensure the execution of the*/
00162 /*  scripts in the queue.*/
00163 /* */
00164 /*  Note: The queue objecct given to the parser has to provide the*/
00165 /*  same interface as the queue defined in tcllib -> struct. This*/
00166 /*  does for example mean that all queues created via that part of*/
00167 /*  tcllib can be immediately used here. Still, the queue doesn't*/
00168 /*  have to come from tcllib -> struct as long as the same*/
00169 /*  interface is provided.*/
00170 /* */
00171 /*  In both modes the parser will return an empty string to the*/
00172 /*  caller.*/
00173 /* */
00174 /*  To a parser in incremental mode the option '-split' can be*/
00175 /*  given and will specify the size of the groups he creates. In*/
00176 /*  other words, -split 5 means that each of the generated scripts*/
00177 /*  will invoke the command prefix for 5 consecutive tags in the*/
00178 /*  HTML string. A parser in normal mode will ignore this option*/
00179 /*  and its value.*/
00180 /* */
00181 /*  The option '-vroot' specifies a virtual root tag. A parser in*/
00182 /*  normal mode will invoke the command prefix for it immediately*/
00183 /*  before and after he processes the tags in the HTML, thus*/
00184 /*  simulating that the HTML string is enclosed in a <vroot>*/
00185 /*  </vroot> combination. In incremental mode however the parser*/
00186 /*  is unable to provide the closing virtual root as he never*/
00187 /*  knows when the input is complete. In this case the first*/
00188 /*  script generated by each invocation of the parser will contain*/
00189 /*  an invocation of the command prefix for the virtual root as*/
00190 /*  its first command.*/
00191 /* */
00192 /*  Interface to the command prefix:*/
00193 /* */
00194 /*  In normal mode the parser will invoke the command prefix with*/
00195 /*  for arguments appended. See '::htmlparse::debugCallback' for a*/
00196 /*  description. In incremental mode however the generated scripts*/
00197 /*  will invoke the command prefix with five arguments*/
00198 /*  appended. The last four of these are the same which were*/
00199 /*  mentioned above. The first however is a placeholder string*/
00200 /*  (\win\) for a clientdata value to be supplied later during the*/
00201 /*  actual execution of the generated scripts. This could be a tk*/
00202 /*  window path, for example. This allows the user of this package*/
00203 /*  to preprocess HTML strings without commiting them to a*/
00204 /*  specific window, object, whatever during parsing. This*/
00205 /*  connection can be made later. This also means that it is*/
00206 /*  possible to cache preprocessed HTML. Of course, nothing*/
00207 /*  prevents the user of the parser to replace the placeholder*/
00208 /*  with an empty string.*/
00209 /* */
00210 /*  Arguments:*/
00211 /*  args    An option/value-list followed by the string to*/
00212 /*      parse. Available options are:*/
00213 /* */
00214 /*      -cmd    The command prefix to invoke for every tag in*/
00215 /*          the HTML string. Defaults to*/
00216 /*          '::htmlparse::debugCallback'.*/
00217 /* */
00218 /*      -vroot  The virtual root tag to add around the HTML in*/
00219 /*          normal mode. In incremental mode it is the*/
00220 /*          first tag in each chunk processed by the*/
00221 /*          parser, but there will be no closing tags.*/
00222 /*          Defaults to 'hmstart'.*/
00223 /* */
00224 /*      -split  The size of the groups produced by an*/
00225 /*          incremental mode parser. Ignored when in*/
00226 /*          normal mode. Defaults to 10. Values <= 0 are*/
00227 /*          not allowed.*/
00228 /* */
00229 /*      -incvar The name of the variable where to store any*/
00230 /*          incomplete HTML into. Optional.*/
00231 /* */
00232 /*      -queue*/
00233 /*          The handle/name of the queue objecct to store*/
00234 /*          the generated scripts into. Activates*/
00235 /*          incremental mode. Normal mode is used if this*/
00236 /*          option is not present.*/
00237 /* */
00238 /*      After the option the command explect a single argument*/
00239 /*      containing the HTML string to parse.*/
00240 /* */
00241 /*  Side Effects:*/
00242 /*  In normal mode as of the invoked command. Else none.*/
00243 /* */
00244 /*  Results:*/
00245 /*  None.*/
00246 
00247 ret  ::htmlparse::parse (type args) {
00248     # Convert the HTML string into a evaluable command sequence.
00249 
00250     variable splitdata
00251 
00252     # Option processing, start with the defaults, then run through the
00253     # list of arguments.
00254 
00255     set cmd    ::htmlparse::debugCallback
00256     set vroot  hmstart
00257     set incvar ""
00258     set split  10
00259     set queue  ""
00260 
00261     while {[set err [cmdline::getopt args {cmd.arg vroot.arg incvar.arg split.arg queue.arg} opt arg]]} {
00262     if {$err < 0} {
00263         return -code error "::htmlparse::parse : $arg"
00264     }
00265     switch -exact -- $opt {
00266         cmd    -
00267         vroot  -
00268         incvar -
00269         queue  {
00270         if {[string length $arg] == 0} {
00271             return -code error "::htmlparse::parse : -$opt illegal argument (empty)"
00272         }
00273         # Each option has an variable with the same name associated with it.
00274         # FRINK: nocheck
00275         set $opt $arg
00276         }
00277         split  {
00278         if {$arg <= 0} {
00279             return -code error "::htmlparse::parse : -split illegal argument (<= 0)"
00280         }
00281         set split $arg
00282         }
00283         default {# Can't happen}
00284     }
00285     }
00286 
00287     if {[llength $args] > 1} {
00288     return -code error "::htmlparse::parse : to many arguments behind the options, expected one"
00289     }
00290     if {[llength $args] < 1} {
00291     return -code error "::htmlparse::parse : html string missing"
00292     }
00293 
00294     set html [PrepareHtml [lindex $args 0]]
00295 
00296     # Look for incomplete HTML from the last iteration and prepend it
00297     # to the input we just got.
00298 
00299     if {$incvar != {}} {
00300     upvar $incvar incomplete
00301     } else {
00302     set incomplete ""
00303     }
00304 
00305     if {[catch {set new $incomplete$html}]} {set new $html}
00306     set html $new
00307 
00308     # Handle incomplete HTML (Recognize incomplete tag at end, buffer
00309     # it up for the next call).
00310 
00311     set end [lindex \{$html\} end]
00312     if {[set idx [string last < $end]] > [string last > $end]} {
00313 
00314     if {$incvar == {}} {
00315         return -code error "::htmlparse::parse : HTML is incomplete, option -incvar is missing"
00316     }
00317 
00318     #  upvar $incvar incomplete -- Already done, s.a.
00319     set incomplete [string range $end $idx end]
00320     incr idx -1
00321     set html       [string range $end 0 $idx]
00322     
00323     } else {
00324     set incomplete ""
00325     }
00326 
00327     # Convert the HTML string into a script.
00328 
00329     set sub "\}\n$cmd {\\1} {} {\\2} \{\}\n$cmd {\\1} {/} {} \{"
00330     regsub -all -- {<([^\s>]+)\s*([^>]*)/>} $html $sub html
00331 
00332      sub =  "\}\n$cmd {\\2} {\\1} {\\3} \{"
00333     regsub -all -- {<(/?)([^\s>]+)\s*([^>]*)>} $html $sub html
00334 
00335     /*  The value of queue now determines wether we process the HTML by*/
00336     /*  ourselves (queue is empty) or if we generate a list of  scripts*/
00337     /*  each of which processes n tags, n the argument to -split.*/
00338 
00339     if {$queue == {}} {
00340     /*  And evaluate it. This is the main parsing step.*/
00341 
00342     eval "$cmd {$vroot} {} {} \{$html\}"
00343     eval "$cmd {$vroot} /  {} {}"
00344     } else {
00345     /*  queue defined, generate list of scripts doing small chunks of tags.*/
00346 
00347      lcmd =  [llength $cmd]
00348      key =   $split,$lcmd
00349 
00350     if {![info exists splitdata($key)]} {
00351         for { i =  0;  group =  {}} {$i < $split} {incr i} {
00352         /*  Use the length of the command prefix to generate*/
00353         /*  additional variables before the main variable after*/
00354         /*  which the placeholder will be inserted.*/
00355 
00356         for { j =  1} {$j < $lcmd} {incr j} {
00357             append group "b${j}_$i "
00358         }
00359 
00360         append group "a$i c$i d$i e$i f$i\n"
00361         }
00362         regsub -all -- {(a[0-9]+)}          $group    {{$\1} \\\\win\\\\} subgroup
00363         regsub -all -- {([b-z_0-9]+[0-9]+)} $subgroup {{$\1}}             subgroup
00364 
00365          splitdata = ($key) [list $group $subgroup]
00366     }
00367 
00368     foreach {group subgroup} $splitdata($key) break ; /*  lassign*/
00369     foreach $group "$cmd {$vroot} {} {} \{$html\}" {
00370         $queue put [string trimright [subst $subgroup]]
00371     }
00372     }
00373     return
00374 }
00375 
00376 /*  htmlparse::PrepareHtml --*/
00377 /* */
00378 /*  Internal helper command of '::htmlparse::parse'. Removes*/
00379 /*  leading DOCTYPE declarations and comments, protects the*/
00380 /*  special characters of tcl from evaluation.*/
00381 /* */
00382 /*  Arguments:*/
00383 /*  html    The HTML string to prepare*/
00384 /* */
00385 /*  Side Effects:*/
00386 /*  None.*/
00387 /* */
00388 /*  Results:*/
00389 /*  The provided HTML string with the described modifications*/
00390 /*  applied to it.*/
00391 
00392 ret  ::htmlparse::PrepareHtml (type html) {
00393     # Remove the following items from the text:
00394     # - A leading   <!DOCTYPE...> declaration.
00395     # - All comments    <!-- ... -->
00396     #
00397     # Also normalize the line endings (\r -> \n).
00398 
00399     # Tcllib SF Bug 861287 - Processing of comments.
00400     # Recognize EOC by RE, instead of fixed string.
00401 
00402     set html [string map [list \r \n] $html]
00403 
00404     regsub -- "^.*<!DOCTYPE\[^>\]*>"    $html {}     html
00405     regsub -all -- "--(\[ \t\n\]*)>"      $html "\001\\1\002" html
00406 
00407     # Recognize borken beginnings of a comment and convert them to PCDATA.
00408     regsub -all -- "<--(\[^\001\]*)\001(\[^\002\]*)\002" $html {\&lt;--\1--\2\&gt;} html
00409 
00410     # And now recognize true comments, remove them.
00411     regsub -all -- "<!--\[^\001\]*\001(\[^\002\]*)\002"  $html {}                   html
00412 
00413     # Protect characters special to tcl (braces, slashes) by
00414     # converting them to their escape sequences.
00415 
00416     return [string map [list \
00417             "\{" "&#123;" \
00418             "\}" "&#125;" \
00419             "\\" "&#92;"] $html]
00420 }
00421 
00422 
00423 
00424 /*  htmlparse::debugCallback --*/
00425 /* */
00426 /*  The standard callback used by the parser in*/
00427 /*  '::htmlparse::parse' if none was specified by the user. Simply*/
00428 /*  dumps its arguments to stdout.  This callback can be used for*/
00429 /*  both normal and incremental mode of the calling parser. In*/
00430 /*  other words, it accepts four or five arguments. The last four*/
00431 /*  arguments are described below. The optional fifth argument*/
00432 /*  contains the clientdata value given to the callback by a*/
00433 /*  parser in incremental mode. All callbacks have to follow the*/
00434 /*  signature of this command in the last four arguments, and*/
00435 /*  callbacks used in incremental parsing have to follow this*/
00436 /*  signature in the last five arguments.*/
00437 /* */
00438 /*  Arguments:*/
00439 /*  tag         The name of the tag currently*/
00440 /*              processed by the parser.*/
00441 /* */
00442 /*  slash           Either empty or a slash. Allows us to*/
00443 /*              distinguish between opening (slash is*/
00444 /*              empty) and closing tags (slash is*/
00445 /*              equal to a '/').*/
00446 /* */
00447 /*  param           The un-interpreted list of parameters*/
00448 /*              to the tag.*/
00449 /* */
00450 /*  textBehindTheTag    The text found by the parser behind*/
00451 /*              the tag named in 'tag'.*/
00452 /* */
00453 /*  Side Effects:*/
00454 /*  None.*/
00455 /* */
00456 /*  Results:*/
00457 /*  None.*/
00458 
00459 ret  ::htmlparse::debugCallback (type args) {
00460     # args = ?clientData? tag slash param textBehindTheTag
00461     puts "==> $args"
00462     return
00463 }
00464 
00465 /*  htmlparse::mapEscapes --*/
00466 /* */
00467 /*  Takes a HTML string, substitutes all escape sequences with*/
00468 /*  their actual characters and returns the resulting string.*/
00469 /*  HTML not containing escape sequences or invalid escape*/
00470 /*  sequences is returned unchanged.*/
00471 /* */
00472 /*  Arguments:*/
00473 /*  html    The string to modify*/
00474 /* */
00475 /*  Side Effects:*/
00476 /*  None.*/
00477 /* */
00478 /*  Results:*/
00479 /*  The argument string with all escape sequences replaced with*/
00480 /*  their actual characters.*/
00481 
00482 ret  ::htmlparse::mapEscapes (type html) {
00483     # Find HTML escape characters of the form &xxx(;|EOW)
00484 
00485     # Quote special Tcl chars so they pass through [subst] unharmed.
00486     set new [string map [list \] \\\] \[ \\\[ \$ \\\$ \\ \\\\] $html]
00487     regsub -all -- {&([[:alnum:]]{2,7})(;|\M)} $new {[DoNamedMap \1 {\2}]} new
00488     regsub -all -- {&#([[:digit:]]{1,5})(;|\M)} $new {[DoDecMap \1 {\2}]} new
00489     regsub -all -- {&#x([[:xdigit:]]{1,4})(;|\M)} $new {[DoHexMap \1 {\2}]} new
00490     return [subst $new]
00491 }
00492 
00493 ret  ::htmlparse::DoNamedMap (type name , type endOf) {
00494     variable namedEntities
00495     if {[info exist namedEntities($name)]} {
00496     return $namedEntities($name)
00497     } else {
00498     # Put it back..
00499     return "&$name$endOf"
00500     }
00501 }
00502 
00503 ret  ::htmlparse::DoDecMap (type dec , type endOf) {
00504     scan $dec %d dec
00505     if {$dec <= 0xFFFD} {
00506     return [format %c $dec]
00507     } else {
00508     # Put it back..
00509     return "&#$dec$endOf"
00510     }
00511 }
00512 
00513 ret  ::htmlparse::DoHexMap (type hex , type endOf) {
00514     scan $hex %x value
00515     if {$value <= 0xFFFD} {
00516     return [format %c $value]
00517     } else {
00518     # Put it back..
00519     return "&#x$hex$endOf"
00520     }
00521 }
00522 
00523 /*  htmlparse::2tree --*/
00524 /* */
00525 /*  This command is a wrapper around '::htmlparse::parse' which*/
00526 /*  takes a HTML string and converts it into a tree containing the*/
00527 /*  logical structure of the parsed document. The tree object has*/
00528 /*  to be created by the caller. It is also expected that the tree*/
00529 /*  object provides the same interface as the tree object from*/
00530 /*  tcllib -> struct. It doesn't have to come from that module*/
00531 /*  though. The internal callback does some basic checking of HTML*/
00532 /*  validity and tries to recover from the most basic errors.*/
00533 /* */
00534 /*  Arguments:*/
00535 /*  html    The HTML string to parse and convert.*/
00536 /*  tree    The name of the tree to fill.*/
00537 /* */
00538 /*  Side Effects:*/
00539 /*  Creates a tree object (see tcllib -> struct)*/
00540 /*  and modifies it.*/
00541 /* */
00542 /*  Results:*/
00543 /*  The contents of 'tree'.*/
00544 
00545 ret  ::htmlparse::2tree (type html , type tree) {
00546 
00547     # One internal datastructure is required, a stack of open
00548     # tags. This stack is also provided by the 'struct' module of
00549     # tcllib. As the operation of this command is synchronuous we
00550     # don't have to take care against multiple running copies at the
00551     # same times (Such are possible, but will be in different
00552     # interpreters and true concurrency is possible only if they are
00553     # in different threads too). IOW, no need for tricks to make the
00554     # internal datastructure unique.
00555 
00556     catch {::htmlparse::tags destroy}
00557 
00558     ::struct::stack ::htmlparse::tags
00559     ::htmlparse::tags push root
00560     $tree set root type root
00561 
00562     parse -cmd [list ::htmlparse::2treeCallback $tree] $html
00563 
00564     # A bit hackish, correct the ordering of nodes for the optional
00565     # tag types, over a larger area when was seen by the parser itself.
00566 
00567     $tree walk root -order post n {
00568     ::htmlparse::Reorder $tree $n
00569     }
00570 
00571     ::htmlparse::tags destroy
00572     return $tree
00573 }
00574 
00575 /*  htmlparse::2treeCallback --*/
00576 /* */
00577 /*  Internal helper command. A special callback to*/
00578 /*  '::htmlparse::parse' used by '::htmlparse::2tree' which takes*/
00579 /*  the incoming stream of tags and converts them into a tree*/
00580 /*  representing the inner structure of the parsed HTML*/
00581 /*  document. Recovers from simple HTML errors like missing*/
00582 /*  opening tags, missing closing tags and overlapping tags.*/
00583 /* */
00584 /*  Arguments:*/
00585 /*  tree            The name of the tree to manipulate.*/
00586 /*  tag         See '::htmlparse::debugCallback'.*/
00587 /*  slash           See '::htmlparse::debugCallback'.*/
00588 /*  param           See '::htmlparse::debugCallback'.*/
00589 /*  textBehindTheTag    See '::htmlparse::debugCallback'.*/
00590 /* */
00591 /*  Side Effects:*/
00592 /*  Manipulates the tree object whose name was given as the first*/
00593 /*  argument.*/
00594 /* */
00595 /*  Results:*/
00596 /*  None.*/
00597 
00598 ret  ::htmlparse::2treeCallback (type tree , type tag , type slash , type param , type textBehindTheTag) {
00599     # This could be table-driven I think but for now the switches
00600     # should work fine.
00601 
00602     # Normalize tag information for later comparisons. Also remove
00603     # superfluous whitespace. Don't forget to decode the standard
00604     # entities.
00605 
00606     set  tag  [string tolower $tag]
00607     set  textBehindTheTag [string trim $textBehindTheTag]
00608     if {$textBehindTheTag != {}} {
00609     set text [mapEscapes $textBehindTheTag]
00610     }
00611 
00612     if {"$slash" == "/"} {
00613     # Handle closing tags. Standard operation is to pop the tag
00614     # from the stack of open tags. We don't do this for </p> and
00615     # </li>. As they were optional they were never pushed onto the
00616     # stack (Well, actually they are just popped immediately after
00617     # they were pusheed, see below).
00618 
00619     switch -exact -- $tag {
00620         base - option - meta - li - p {
00621         # Ignore, nothing to do.        
00622         }
00623         default {
00624         # The moment we get a closing tag which does not match
00625         # the tag on the stack we have two possibilities on how
00626         # this came into existence to choose from:
00627         #
00628         # a) A tag is now closed but was never opened.
00629         # b) A tag requiring an end tag was opened but the end
00630         #    tag was omitted and we now are at a tag which was
00631         #    opened before the one with the omitted end tag.
00632 
00633         # NOTE:
00634         # Pages delivered from the amazon.uk site contain both
00635         # cases: </a> without opening, <b> & <font> without
00636         # closing. Another error: <a><b></a></b>, i.e. overlapping
00637         # tags. Fortunately this can be handled by the algorithm
00638         # below, in two cycles, one of which is case (b), followed
00639         # by case (a). It seems as if Amazon/UK believes that visual
00640         # markup like <b> and <font> is an option (switch-on) instead
00641         # of a region.
00642 
00643         # Algorithm used here to deal with these:
00644         # 1) Search whole stack for the matching opening tag.
00645         #    If there is one assume case (b) and pop everything
00646         #    until and including this opening tag.
00647         # 2) If no matching opening tag was found assume case
00648         #    (a) and ignore the tag.
00649         #
00650         # Part (1) also subsumes the normal case, i.e. the
00651         # matching tag is at the top of the stack.
00652 
00653         set nodes [::htmlparse::tags peek [::htmlparse::tags size]]
00654         # Note: First item is top of stack, last item is bottom of stack !
00655         # (This behaviour of tcllib stacks is not documented
00656         # -> we should update the manpage).
00657 
00658         #foreach n $nodes {lappend tstring [p get $n -key type]}
00659         #puts stderr --[join $tstring]--
00660 
00661         set level 1
00662         set found 0
00663         foreach n $nodes {
00664             set type [$tree get $n type]
00665             if {0 == [string compare $tag $type]} {
00666             # Found an earlier open tag -> (b).
00667             set found 1
00668             break
00669             }
00670             incr level
00671         }
00672         if {$found} {
00673             ::htmlparse::tags pop $level
00674             if {$level > 1} {
00675             #foreach n $nodes {lappend tstring [$tree get $n type]}
00676             #puts stderr "\tdesync at <$tag> ($tstring) => pop $level"
00677             }
00678         } else {
00679             #foreach n $nodes {lappend tstring [$tree get $n type]}
00680             #puts stderr "\tdesync at <$tag> ($tstring) => ignore"
00681         }
00682         }
00683     }
00684 
00685     # If there is text behind a closing tag X it belongs to the
00686     # parent tag of X.
00687 
00688     if {$textBehindTheTag != {}} {
00689         # Attach the text behind the closing tag to the reopened
00690         # context.
00691 
00692         set        pcd  [$tree insert [::htmlparse::tags peek] end]
00693         $tree set $pcd  type PCDATA
00694         $tree set $pcd  data $textBehindTheTag
00695     }
00696 
00697     } else {
00698     # Handle opening tags. The standard operation for most is to
00699     # push them onto the stack and thus open a nested context.
00700     # This does not happen for both the optional tags (p, li) and
00701     # the ones which don't have closing tags (meta, br, option,
00702     # input, area, img).
00703     #
00704     # The text coming with the tag will be added after the tag if
00705     # it is a tag without a matching close, else it will be added
00706     # as a node below the tag (as it is the region between the
00707     # opening and closing tag and thus nested inside). Empty text
00708     # is ignored under all circcumstances.
00709 
00710     set        node [$tree insert [::htmlparse::tags peek] end]
00711     $tree set $node type $tag
00712     $tree set $node data $param
00713 
00714     if {$textBehindTheTag != {}} {
00715         switch -exact -- $tag {
00716         input - area - img - br {
00717             set pcd  [$tree insert [::htmlparse::tags peek] end]
00718         }
00719         default {
00720             set pcd  [$tree insert $node end]
00721         }
00722         }
00723         $tree set $pcd type PCDATA
00724         $tree set $pcd data $textBehindTheTag
00725     }
00726 
00727     ::htmlparse::tags push $node
00728 
00729     # Special handling: <p>, <li> may have no closing tag => pop
00730     #                 : them immediately.
00731     #
00732     # Special handling: <meta>, <br>, <option>, <input>, <area>,
00733     #                 : <img>: no closing tags for these.
00734 
00735     switch -exact -- $tag {
00736         hr - base - meta - li - br - option - input - area - img - p - h1 - h2 - h3 - h4 - h5 - h6 {
00737         ::htmlparse::tags pop
00738         }
00739         default {}
00740     }
00741     }
00742 }
00743 
00744 /*  htmlparse::removeVisualFluff --*/
00745 /* */
00746 /*  This command walks a tree as generated by '::htmlparse::2tree'*/
00747 /*  and removes all the nodes which represent visual tags and not*/
00748 /*  structural ones. The purpose of the command is to make the*/
00749 /*  tree easier to navigate without getting bogged down in visual*/
00750 /*  information not relevant to the search.*/
00751 /* */
00752 /*  Arguments:*/
00753 /*  tree    The name of the tree to cut down.*/
00754 /* */
00755 /*  Side Effects:*/
00756 /*  Modifies the specified tree.*/
00757 /* */
00758 /*  Results:*/
00759 /*  None.*/
00760 
00761 ret  ::htmlparse::removeVisualFluff (type tree) {
00762     $tree walk root -order post n {
00763     ::htmlparse::RemoveVisualFluff $tree $n
00764     }
00765     return
00766 }
00767 
00768 /*  htmlparse::removeFormDefs --*/
00769 /* */
00770 /*  Like '::htmlparse::removeVisualFluff' this command is here to*/
00771 /*  cut down on the size of the tree as generated by*/
00772 /*  '::htmlparse::2tree'. It removes all nodes representing forms*/
00773 /*  and form elements.*/
00774 /* */
00775 /*  Arguments:*/
00776 /*  tree    The name of the tree to cut down.*/
00777 /* */
00778 /*  Side Effects:*/
00779 /*  Modifies the specified tree.*/
00780 /* */
00781 /*  Results:*/
00782 /*  None.*/
00783 
00784 ret  ::htmlparse::removeFormDefs (type tree) {
00785     $tree walk root -order post n {
00786     ::htmlparse::RemoveFormDefs $tree $n
00787     }
00788     return
00789 }
00790 
00791 /*  htmlparse::RemoveVisualFluff --*/
00792 /* */
00793 /*  Internal helper command to*/
00794 /*  '::htmlparse::removeVisualFluff'. Does the actual work.*/
00795 /* */
00796 /*  Arguments:*/
00797 /*  tree    The name of the tree currently processed*/
00798 /*  node    The name of the node to look at.*/
00799 /* */
00800 /*  Side Effects:*/
00801 /*  Modifies the specified tree.*/
00802 /* */
00803 /*  Results:*/
00804 /*  None.*/
00805 
00806 ret  ::htmlparse::RemoveVisualFluff (type tree , type node) {
00807     switch -exact -- [$tree get $node type] {
00808     hmstart - html - font - center - div - sup - b - i {
00809         # Removes the node, but does not affect the nodes below
00810         # it. These are just made into chiildren of the parent of
00811         # this node, in its place.
00812 
00813         $tree cut $node
00814     }
00815     script - option - select - meta - map - img {
00816         # Removes this node and everything below it.
00817         $tree delete $node
00818     }
00819     default {
00820         # Ignore tag
00821     }
00822     }
00823 }
00824 
00825 /*  htmlparse::RemoveFormDefs --*/
00826 /* */
00827 /*  Internal helper command to*/
00828 /*  '::htmlparse::removeFormDefs'. Does the actual work.*/
00829 /* */
00830 /*  Arguments:*/
00831 /*  tree    The name of the tree currently processed*/
00832 /*  node    The name of the node to look at.*/
00833 /* */
00834 /*  Side Effects:*/
00835 /*  Modifies the specified tree.*/
00836 /* */
00837 /*  Results:*/
00838 /*  None.*/
00839 
00840 ret  ::htmlparse::RemoveFormDefs (type tree , type node) {
00841     switch -exact -- [$tree get $node type] {
00842     form {
00843         $tree delete $node
00844     }
00845     default {
00846         # Ignore tag
00847     }
00848     }
00849 }
00850 
00851 /*  htmlparse::Reorder --*/
00852 
00853 /*  Internal helper command to '::htmlparse::2tree'. Moves the*/
00854 /*  nodes between p/p, li/li and h<i> sequences below the*/
00855 /*  paragraphs and items. IOW, corrects misconstructions for*/
00856 /*  the optional node types.*/
00857 /* */
00858 /*  Arguments:*/
00859 /*  tree    The name of the tree currently processed*/
00860 /*  node    The name of the node to look at.*/
00861 /* */
00862 /*  Side Effects:*/
00863 /*  Modifies the specified tree.*/
00864 /* */
00865 /*  Results:*/
00866 /*  None.*/
00867 
00868 ret  ::htmlparse::Reorder (type tree , type node) {
00869     switch -exact -- [set tp [$tree get $node type]] {
00870     h1 - h2 - h3 - h4 - h5 - h6 - p - li {
00871         # Look for right siblings until the next node with a
00872         # similar type (or end of level) and move these below this
00873         # node.
00874 
00875         while {1} {
00876         set sibling [$tree next $node]
00877         if {
00878             ($sibling == {}) ||
00879             ([lsearch -exact {h1 h2 h3 h4 h5 h6 p li} [$tree get $sibling type]] != -1)
00880         } {
00881             break
00882         }
00883         $tree move $node end $sibling
00884         }
00885     }
00886     default {
00887         # Ignore tag
00888     }
00889     }
00890 }
00891 
00892 /*  ### ######### ###########################*/
00893 
00894 package provide htmlparse 1.1.2
00895