00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019 package require Tcl 8.2
00020 package require struct::stack
00021 package require cmdline 1.1
00022
00023 namespace ::htmlparse {
00024 namespace export \
00025 parse \
00026 debugCallback \
00027 mapEscapes \
00028 2tree \
00029 removeVisualFluff \
00030 removeFormDefs
00031
00032
00033
00034
00035
00036
00037
00038 array namedEntities = {
00039 nbsp \xa0 iexcl \xa1 cent \xa2 pound \xa3 curren \xa4
00040 yen \xa5 brvbar \xa6 sect \xa7 uml \xa8 copy \xa9
00041 ordf \xaa laquo \xab not \xac shy \xad reg \xae
00042 macr \xaf deg \xb0 plusmn \xb1 sup2 \xb2 sup3 \xb3
00043 acute \xb4 micro \xb5 para \xb6 middot \xb7 cedil \xb8
00044 sup1 \xb9 ordm \xba raquo \xbb frac14 \xbc frac12 \xbd
00045 frac34 \xbe iquest \xbf Agrave \xc0 Aacute \xc1 Acirc \xc2
00046 Atilde \xc3 Auml \xc4 Aring \xc5 AElig \xc6 Ccedil \xc7
00047 Egrave \xc8 Eacute \xc9 Ecirc \xca Euml \xcb Igrave \xcc
00048 Iacute \xcd Icirc \xce Iuml \xcf ETH \xd0 Ntilde \xd1
00049 Ograve \xd2 Oacute \xd3 Ocirc \xd4 Otilde \xd5 Ouml \xd6
00050 times \xd7 Oslash \xd8 Ugrave \xd9 Uacute \xda Ucirc \xdb
00051 Uuml \xdc Yacute \xdd THORN \xde szlig \xdf agrave \xe0
00052 aacute \xe1 acirc \xe2 atilde \xe3 auml \xe4 aring \xe5
00053 aelig \xe6 ccedil \xe7 egrave \xe8 eacute \xe9 ecirc \xea
00054 euml \xeb igrave \xec iacute \xed icirc \xee iuml \xef
00055 eth \xf0 ntilde \xf1 ograve \xf2 oacute \xf3 ocirc \xf4
00056 otilde \xf5 ouml \xf6 divide \xf7 oslash \xf8 ugrave \xf9
00057 uacute \xfa ucirc \xfb uuml \xfc yacute \xfd thorn \xfe
00058 yuml \xff
00059 }
00060
00061
00062 array namedEntities = {
00063 fnof \u192 Alpha \u391 Beta \u392 Gamma \u393 Delta \u394
00064 Epsilon \u395 Zeta \u396 Eta \u397 Theta \u398 Iota \u399
00065 Kappa \u39A Lambda \u39B Mu \u39C Nu \u39D Xi \u39E
00066 Omicron \u39F Pi \u3A0 Rho \u3A1 Sigma \u3A3 Tau \u3A4
00067 Upsilon \u3A5 Phi \u3A6 Chi \u3A7 Psi \u3A8 Omega \u3A9
00068 alpha \u3B1 beta \u3B2 gamma \u3B3 delta \u3B4 epsilon \u3B5
00069 zeta \u3B6 eta \u3B7 theta \u3B8 iota \u3B9 kappa \u3BA
00070 lambda \u3BB mu \u3BC nu \u3BD xi \u3BE omicron \u3BF
00071 pi \u3C0 rho \u3C1 sigmaf \u3C2 sigma \u3C3 tau \u3C4
00072 upsilon \u3C5 phi \u3C6 chi \u3C7 psi \u3C8 omega \u3C9
00073 thetasym \u3D1 upsih \u3D2 piv \u3D6 bull \u2022
00074 hellip \u2026 prime \u2032 Prime \u2033 oline \u203E
00075 frasl \u2044 weierp \u2118 image \u2111 real \u211C
00076 trade \u2122 alefsym \u2135 larr \u2190 uarr \u2191
00077 rarr \u2192 darr \u2193 harr \u2194 crarr \u21B5
00078 lArr \u21D0 uArr \u21D1 rArr \u21D2 dArr \u21D3 hArr \u21D4
00079 forall \u2200 part \u2202 exist \u2203 empty \u2205
00080 nabla \u2207 isin \u2208 notin \u2209 ni \u220B prod \u220F
00081 sum \u2211 minus \u2212 lowast \u2217 radic \u221A
00082 prop \u221D infin \u221E ang \u2220 and \u2227 or \u2228
00083 cap \u2229 cup \u222A int \u222B there4 \u2234 sim \u223C
00084 cong \u2245 asymp \u2248 ne \u2260 equiv \u2261 le \u2264
00085 ge \u2265 sub \u2282 sup \u2283 nsub \u2284 sube \u2286
00086 supe \u2287 oplus \u2295 otimes \u2297 perp \u22A5
00087 sdot \u22C5 lceil \u2308 rceil \u2309 lfloor \u230A
00088 rfloor \u230B lang \u2329 rang \u232A loz \u25CA
00089 spades \u2660 clubs \u2663 hearts \u2665 diams \u2666
00090 }
00091
00092
00093 array namedEntities = {
00094 quot \x22 amp \x26 lt \x3C gt \x3E OElig \u152 oelig \u153
00095 Scaron \u160 scaron \u161 Yuml \u178 circ \u2C6
00096 tilde \u2DC ensp \u2002 emsp \u2003 thinsp \u2009
00097 zwnj \u200C zwj \u200D lrm \u200E rlm \u200F ndash \u2013
00098 mdash \u2014 lsquo \u2018 rsquo \u2019 sbquo \u201A
00099 ldquo \u201C rdquo \u201D bdquo \u201E dagger \u2020
00100 Dagger \u2021 permil \u2030 lsaquo \u2039 rsaquo \u203A
00101 euro \u20AC
00102 }
00103
00104
00105
00106
00107
00108
00109
00110
00111
00112
00113 variable splitdata
00114 array splitdata = {}
00115
00116 }
00117
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130
00131
00132
00133
00134
00135
00136
00137
00138
00139
00140
00141
00142
00143
00144
00145
00146
00147
00148
00149
00150
00151
00152
00153
00154
00155
00156
00157
00158
00159
00160
00161
00162
00163
00164
00165
00166
00167
00168
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204
00205
00206
00207
00208
00209
00210
00211
00212
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229
00230
00231
00232
00233
00234
00235
00236
00237
00238
00239
00240
00241
00242
00243
00244
00245
00246
00247 ret ::htmlparse::parse (type args) {
00248 # Convert the HTML string into a evaluable command sequence.
00249
00250 variable splitdata
00251
00252 # Option processing, start with the defaults, then run through the
00253 # list of arguments.
00254
00255 set cmd ::htmlparse::debugCallback
00256 set vroot hmstart
00257 set incvar ""
00258 set split 10
00259 set queue ""
00260
00261 while {[set err [cmdline::getopt args {cmd.arg vroot.arg incvar.arg split.arg queue.arg} opt arg]]} {
00262 if {$err < 0} {
00263 return -code error "::htmlparse::parse : $arg"
00264 }
00265 switch -exact -- $opt {
00266 cmd -
00267 vroot -
00268 incvar -
00269 queue {
00270 if {[string length $arg] == 0} {
00271 return -code error "::htmlparse::parse : -$opt illegal argument (empty)"
00272 }
00273 # Each option has an variable with the same name associated with it.
00274 # FRINK: nocheck
00275 set $opt $arg
00276 }
00277 split {
00278 if {$arg <= 0} {
00279 return -code error "::htmlparse::parse : -split illegal argument (<= 0)"
00280 }
00281 set split $arg
00282 }
00283 default {# Can't happen}
00284 }
00285 }
00286
00287 if {[llength $args] > 1} {
00288 return -code error "::htmlparse::parse : to many arguments behind the options, expected one"
00289 }
00290 if {[llength $args] < 1} {
00291 return -code error "::htmlparse::parse : html string missing"
00292 }
00293
00294 set html [PrepareHtml [lindex $args 0]]
00295
00296 # Look for incomplete HTML from the last iteration and prepend it
00297 # to the input we just got.
00298
00299 if {$incvar != {}} {
00300 upvar $incvar incomplete
00301 } else {
00302 set incomplete ""
00303 }
00304
00305 if {[catch {set new $incomplete$html}]} {set new $html}
00306 set html $new
00307
00308 # Handle incomplete HTML (Recognize incomplete tag at end, buffer
00309 # it up for the next call).
00310
00311 set end [lindex \{$html\} end]
00312 if {[set idx [string last < $end]] > [string last > $end]} {
00313
00314 if {$incvar == {}} {
00315 return -code error "::htmlparse::parse : HTML is incomplete, option -incvar is missing"
00316 }
00317
00318 # upvar $incvar incomplete -- Already done, s.a.
00319 set incomplete [string range $end $idx end]
00320 incr idx -1
00321 set html [string range $end 0 $idx]
00322
00323 } else {
00324 set incomplete ""
00325 }
00326
00327 # Convert the HTML string into a script.
00328
00329 set sub "\}\n$cmd {\\1} {} {\\2} \{\}\n$cmd {\\1} {/} {} \{"
00330 regsub -all -- {<([^\s>]+)\s*([^>]*)/>} $html $sub html
00331
00332 sub = "\}\n$cmd {\\2} {\\1} {\\3} \{"
00333 regsub -all -- {<(/?)([^\s>]+)\s*([^>]*)>} $html $sub html
00334
00335
00336
00337
00338
00339 if {$queue == {}} {
00340
00341
00342 eval "$cmd {$vroot} {} {} \{$html\}"
00343 eval "$cmd {$vroot} / {} {}"
00344 } else {
00345
00346
00347 lcmd = [llength $cmd]
00348 key = $split,$lcmd
00349
00350 if {![info exists splitdata($key)]} {
00351 for { i = 0; group = {}} {$i < $split} {incr i} {
00352
00353
00354
00355
00356 for { j = 1} {$j < $lcmd} {incr j} {
00357 append group "b${j}_$i "
00358 }
00359
00360 append group "a$i c$i d$i e$i f$i\n"
00361 }
00362 regsub -all -- {(a[0-9]+)} $group {{$\1} \\\\win\\\\} subgroup
00363 regsub -all -- {([b-z_0-9]+[0-9]+)} $subgroup {{$\1}} subgroup
00364
00365 splitdata = ($key) [list $group $subgroup]
00366 }
00367
00368 foreach {group subgroup} $splitdata($key) break ;
00369 foreach $group "$cmd {$vroot} {} {} \{$html\}" {
00370 $queue put [string trimright [subst $subgroup]]
00371 }
00372 }
00373 return
00374 }
00375
00376 /* htmlparse::PrepareHtml --*/
00377 /* */
00378 /* Internal helper command of '::htmlparse::parse'. Removes*/
00379 /* leading DOCTYPE declarations and comments, protects the*/
00380 /* special characters of tcl from evaluation.*/
00381 /* */
00382 /* Arguments:*/
00383 /* html The HTML string to prepare*/
00384 /* */
00385 /* Side Effects:*/
00386 /* None.*/
00387 /* */
00388 /* Results:*/
00389 /* The provided HTML string with the described modifications*/
00390 /* applied to it.*/
00391
00392 ret ::htmlparse::PrepareHtml (type html) {
00393 # Remove the following items from the text:
00394 # - A leading <!DOCTYPE...> declaration.
00395 # - All comments <!-- ... -->
00396 #
00397 # Also normalize the line endings (\r -> \n).
00398
00399 # Tcllib SF Bug 861287 - Processing of comments.
00400 # Recognize EOC by RE, instead of fixed string.
00401
00402 set html [string map [list \r \n] $html]
00403
00404 regsub -- "^.*<!DOCTYPE\[^>\]*>" $html {} html
00405 regsub -all -- "--(\[ \t\n\]*)>" $html "\001\\1\002" html
00406
00407 # Recognize borken beginnings of a comment and convert them to PCDATA.
00408 regsub -all -- "<--(\[^\001\]*)\001(\[^\002\]*)\002" $html {\<--\1--\2\>} html
00409
00410 # And now recognize true comments, remove them.
00411 regsub -all -- "<!--\[^\001\]*\001(\[^\002\]*)\002" $html {} html
00412
00413 # Protect characters special to tcl (braces, slashes) by
00414 # converting them to their escape sequences.
00415
00416 return [string map [list \
00417 "\{" "{" \
00418 "\}" "}" \
00419 "\\" "\"] $html]
00420 }
00421
00422
00423
00424 /* htmlparse::debugCallback --*/
00425 /* */
00426 /* The standard callback used by the parser in*/
00427 /* '::htmlparse::parse' if none was specified by the user. Simply*/
00428 /* dumps its arguments to stdout. This callback can be used for*/
00429 /* both normal and incremental mode of the calling parser. In*/
00430 /* other words, it accepts four or five arguments. The last four*/
00431 /* arguments are described below. The optional fifth argument*/
00432 /* contains the clientdata value given to the callback by a*/
00433 /* parser in incremental mode. All callbacks have to follow the*/
00434 /* signature of this command in the last four arguments, and*/
00435 /* callbacks used in incremental parsing have to follow this*/
00436 /* signature in the last five arguments.*/
00437 /* */
00438 /* Arguments:*/
00439 /* tag The name of the tag currently*/
00440 /* processed by the parser.*/
00441 /* */
00442 /* slash Either empty or a slash. Allows us to*/
00443 /* distinguish between opening (slash is*/
00444 /* empty) and closing tags (slash is*/
00445 /* equal to a '/').*/
00446 /* */
00447 /* param The un-interpreted list of parameters*/
00448 /* to the tag.*/
00449 /* */
00450 /* textBehindTheTag The text found by the parser behind*/
00451 /* the tag named in 'tag'.*/
00452 /* */
00453 /* Side Effects:*/
00454 /* None.*/
00455 /* */
00456 /* Results:*/
00457 /* None.*/
00458
00459 ret ::htmlparse::debugCallback (type args) {
00460 # args = ?clientData? tag slash param textBehindTheTag
00461 puts "==> $args"
00462 return
00463 }
00464
00465 /* htmlparse::mapEscapes --*/
00466 /* */
00467 /* Takes a HTML string, substitutes all escape sequences with*/
00468 /* their actual characters and returns the resulting string.*/
00469 /* HTML not containing escape sequences or invalid escape*/
00470 /* sequences is returned unchanged.*/
00471 /* */
00472 /* Arguments:*/
00473 /* html The string to modify*/
00474 /* */
00475 /* Side Effects:*/
00476 /* None.*/
00477 /* */
00478 /* Results:*/
00479 /* The argument string with all escape sequences replaced with*/
00480 /* their actual characters.*/
00481
00482 ret ::htmlparse::mapEscapes (type html) {
00483 # Find HTML escape characters of the form &xxx(;|EOW)
00484
00485 # Quote special Tcl chars so they pass through [subst] unharmed.
00486 set new [string map [list \] \\\] \[ \\\[ \$ \\\$ \\ \\\\] $html]
00487 regsub -all -- {&([[:alnum:]]{2,7})(;|\M)} $new {[DoNamedMap \1 {\2}]} new
00488 regsub -all -- {&#([[:digit:]]{1,5})(;|\M)} $new {[DoDecMap \1 {\2}]} new
00489 regsub -all -- {&#x([[:xdigit:]]{1,4})(;|\M)} $new {[DoHexMap \1 {\2}]} new
00490 return [subst $new]
00491 }
00492
00493 ret ::htmlparse::DoNamedMap (type name , type endOf) {
00494 variable namedEntities
00495 if {[info exist namedEntities($name)]} {
00496 return $namedEntities($name)
00497 } else {
00498 # Put it back..
00499 return "&$name$endOf"
00500 }
00501 }
00502
00503 ret ::htmlparse::DoDecMap (type dec , type endOf) {
00504 scan $dec %d dec
00505 if {$dec <= 0xFFFD} {
00506 return [format %c $dec]
00507 } else {
00508 # Put it back..
00509 return "&#$dec$endOf"
00510 }
00511 }
00512
00513 ret ::htmlparse::DoHexMap (type hex , type endOf) {
00514 scan $hex %x value
00515 if {$value <= 0xFFFD} {
00516 return [format %c $value]
00517 } else {
00518 # Put it back..
00519 return "&#x$hex$endOf"
00520 }
00521 }
00522
00523 /* htmlparse::2tree --*/
00524 /* */
00525 /* This command is a wrapper around '::htmlparse::parse' which*/
00526 /* takes a HTML string and converts it into a tree containing the*/
00527 /* logical structure of the parsed document. The tree object has*/
00528 /* to be created by the caller. It is also expected that the tree*/
00529 /* object provides the same interface as the tree object from*/
00530 /* tcllib -> struct. It doesn't have to come from that module*/
00531 /* though. The internal callback does some basic checking of HTML*/
00532 /* validity and tries to recover from the most basic errors.*/
00533 /* */
00534 /* Arguments:*/
00535 /* html The HTML string to parse and convert.*/
00536 /* tree The name of the tree to fill.*/
00537 /* */
00538 /* Side Effects:*/
00539 /* Creates a tree object (see tcllib -> struct)*/
00540 /* and modifies it.*/
00541 /* */
00542 /* Results:*/
00543 /* The contents of 'tree'.*/
00544
00545 ret ::htmlparse::2tree (type html , type tree) {
00546
00547 # One internal datastructure is required, a stack of open
00548 # tags. This stack is also provided by the 'struct' module of
00549 # tcllib. As the operation of this command is synchronuous we
00550 # don't have to take care against multiple running copies at the
00551 # same times (Such are possible, but will be in different
00552 # interpreters and true concurrency is possible only if they are
00553 # in different threads too). IOW, no need for tricks to make the
00554 # internal datastructure unique.
00555
00556 catch {::htmlparse::tags destroy}
00557
00558 ::struct::stack ::htmlparse::tags
00559 ::htmlparse::tags push root
00560 $tree set root type root
00561
00562 parse -cmd [list ::htmlparse::2treeCallback $tree] $html
00563
00564 # A bit hackish, correct the ordering of nodes for the optional
00565 # tag types, over a larger area when was seen by the parser itself.
00566
00567 $tree walk root -order post n {
00568 ::htmlparse::Reorder $tree $n
00569 }
00570
00571 ::htmlparse::tags destroy
00572 return $tree
00573 }
00574
00575 /* htmlparse::2treeCallback --*/
00576 /* */
00577 /* Internal helper command. A special callback to*/
00578 /* '::htmlparse::parse' used by '::htmlparse::2tree' which takes*/
00579 /* the incoming stream of tags and converts them into a tree*/
00580 /* representing the inner structure of the parsed HTML*/
00581 /* document. Recovers from simple HTML errors like missing*/
00582 /* opening tags, missing closing tags and overlapping tags.*/
00583 /* */
00584 /* Arguments:*/
00585 /* tree The name of the tree to manipulate.*/
00586 /* tag See '::htmlparse::debugCallback'.*/
00587 /* slash See '::htmlparse::debugCallback'.*/
00588 /* param See '::htmlparse::debugCallback'.*/
00589 /* textBehindTheTag See '::htmlparse::debugCallback'.*/
00590 /* */
00591 /* Side Effects:*/
00592 /* Manipulates the tree object whose name was given as the first*/
00593 /* argument.*/
00594 /* */
00595 /* Results:*/
00596 /* None.*/
00597
00598 ret ::htmlparse::2treeCallback (type tree , type tag , type slash , type param , type textBehindTheTag) {
00599 # This could be table-driven I think but for now the switches
00600 # should work fine.
00601
00602 # Normalize tag information for later comparisons. Also remove
00603 # superfluous whitespace. Don't forget to decode the standard
00604 # entities.
00605
00606 set tag [string tolower $tag]
00607 set textBehindTheTag [string trim $textBehindTheTag]
00608 if {$textBehindTheTag != {}} {
00609 set text [mapEscapes $textBehindTheTag]
00610 }
00611
00612 if {"$slash" == "/"} {
00613 # Handle closing tags. Standard operation is to pop the tag
00614 # from the stack of open tags. We don't do this for </p> and
00615 # </li>. As they were optional they were never pushed onto the
00616 # stack (Well, actually they are just popped immediately after
00617 # they were pusheed, see below).
00618
00619 switch -exact -- $tag {
00620 base - option - meta - li - p {
00621 # Ignore, nothing to do.
00622 }
00623 default {
00624 # The moment we get a closing tag which does not match
00625 # the tag on the stack we have two possibilities on how
00626 # this came into existence to choose from:
00627 #
00628 # a) A tag is now closed but was never opened.
00629 # b) A tag requiring an end tag was opened but the end
00630 # tag was omitted and we now are at a tag which was
00631 # opened before the one with the omitted end tag.
00632
00633 # NOTE:
00634 # Pages delivered from the amazon.uk site contain both
00635 # cases: </a> without opening, <b> & <font> without
00636 # closing. Another error: <a><b></a></b>, i.e. overlapping
00637 # tags. Fortunately this can be handled by the algorithm
00638 # below, in two cycles, one of which is case (b), followed
00639 # by case (a). It seems as if Amazon/UK believes that visual
00640 # markup like <b> and <font> is an option (switch-on) instead
00641 # of a region.
00642
00643 # Algorithm used here to deal with these:
00644 # 1) Search whole stack for the matching opening tag.
00645 # If there is one assume case (b) and pop everything
00646 # until and including this opening tag.
00647 # 2) If no matching opening tag was found assume case
00648 # (a) and ignore the tag.
00649 #
00650 # Part (1) also subsumes the normal case, i.e. the
00651 # matching tag is at the top of the stack.
00652
00653 set nodes [::htmlparse::tags peek [::htmlparse::tags size]]
00654 # Note: First item is top of stack, last item is bottom of stack !
00655 # (This behaviour of tcllib stacks is not documented
00656 # -> we should update the manpage).
00657
00658 #foreach n $nodes {lappend tstring [p get $n -key type]}
00659 #puts stderr --[join $tstring]--
00660
00661 set level 1
00662 set found 0
00663 foreach n $nodes {
00664 set type [$tree get $n type]
00665 if {0 == [string compare $tag $type]} {
00666 # Found an earlier open tag -> (b).
00667 set found 1
00668 break
00669 }
00670 incr level
00671 }
00672 if {$found} {
00673 ::htmlparse::tags pop $level
00674 if {$level > 1} {
00675 #foreach n $nodes {lappend tstring [$tree get $n type]}
00676 #puts stderr "\tdesync at <$tag> ($tstring) => pop $level"
00677 }
00678 } else {
00679 #foreach n $nodes {lappend tstring [$tree get $n type]}
00680 #puts stderr "\tdesync at <$tag> ($tstring) => ignore"
00681 }
00682 }
00683 }
00684
00685 # If there is text behind a closing tag X it belongs to the
00686 # parent tag of X.
00687
00688 if {$textBehindTheTag != {}} {
00689 # Attach the text behind the closing tag to the reopened
00690 # context.
00691
00692 set pcd [$tree insert [::htmlparse::tags peek] end]
00693 $tree set $pcd type PCDATA
00694 $tree set $pcd data $textBehindTheTag
00695 }
00696
00697 } else {
00698 # Handle opening tags. The standard operation for most is to
00699 # push them onto the stack and thus open a nested context.
00700 # This does not happen for both the optional tags (p, li) and
00701 # the ones which don't have closing tags (meta, br, option,
00702 # input, area, img).
00703 #
00704 # The text coming with the tag will be added after the tag if
00705 # it is a tag without a matching close, else it will be added
00706 # as a node below the tag (as it is the region between the
00707 # opening and closing tag and thus nested inside). Empty text
00708 # is ignored under all circcumstances.
00709
00710 set node [$tree insert [::htmlparse::tags peek] end]
00711 $tree set $node type $tag
00712 $tree set $node data $param
00713
00714 if {$textBehindTheTag != {}} {
00715 switch -exact -- $tag {
00716 input - area - img - br {
00717 set pcd [$tree insert [::htmlparse::tags peek] end]
00718 }
00719 default {
00720 set pcd [$tree insert $node end]
00721 }
00722 }
00723 $tree set $pcd type PCDATA
00724 $tree set $pcd data $textBehindTheTag
00725 }
00726
00727 ::htmlparse::tags push $node
00728
00729 # Special handling: <p>, <li> may have no closing tag => pop
00730 # : them immediately.
00731 #
00732 # Special handling: <meta>, <br>, <option>, <input>, <area>,
00733 # : <img>: no closing tags for these.
00734
00735 switch -exact -- $tag {
00736 hr - base - meta - li - br - option - input - area - img - p - h1 - h2 - h3 - h4 - h5 - h6 {
00737 ::htmlparse::tags pop
00738 }
00739 default {}
00740 }
00741 }
00742 }
00743
00744 /* htmlparse::removeVisualFluff --*/
00745 /* */
00746 /* This command walks a tree as generated by '::htmlparse::2tree'*/
00747 /* and removes all the nodes which represent visual tags and not*/
00748 /* structural ones. The purpose of the command is to make the*/
00749 /* tree easier to navigate without getting bogged down in visual*/
00750 /* information not relevant to the search.*/
00751 /* */
00752 /* Arguments:*/
00753 /* tree The name of the tree to cut down.*/
00754 /* */
00755 /* Side Effects:*/
00756 /* Modifies the specified tree.*/
00757 /* */
00758 /* Results:*/
00759 /* None.*/
00760
00761 ret ::htmlparse::removeVisualFluff (type tree) {
00762 $tree walk root -order post n {
00763 ::htmlparse::RemoveVisualFluff $tree $n
00764 }
00765 return
00766 }
00767
00768 /* htmlparse::removeFormDefs --*/
00769 /* */
00770 /* Like '::htmlparse::removeVisualFluff' this command is here to*/
00771 /* cut down on the size of the tree as generated by*/
00772 /* '::htmlparse::2tree'. It removes all nodes representing forms*/
00773 /* and form elements.*/
00774 /* */
00775 /* Arguments:*/
00776 /* tree The name of the tree to cut down.*/
00777 /* */
00778 /* Side Effects:*/
00779 /* Modifies the specified tree.*/
00780 /* */
00781 /* Results:*/
00782 /* None.*/
00783
00784 ret ::htmlparse::removeFormDefs (type tree) {
00785 $tree walk root -order post n {
00786 ::htmlparse::RemoveFormDefs $tree $n
00787 }
00788 return
00789 }
00790
00791 /* htmlparse::RemoveVisualFluff --*/
00792 /* */
00793 /* Internal helper command to*/
00794 /* '::htmlparse::removeVisualFluff'. Does the actual work.*/
00795 /* */
00796 /* Arguments:*/
00797 /* tree The name of the tree currently processed*/
00798 /* node The name of the node to look at.*/
00799 /* */
00800 /* Side Effects:*/
00801 /* Modifies the specified tree.*/
00802 /* */
00803 /* Results:*/
00804 /* None.*/
00805
00806 ret ::htmlparse::RemoveVisualFluff (type tree , type node) {
00807 switch -exact -- [$tree get $node type] {
00808 hmstart - html - font - center - div - sup - b - i {
00809 # Removes the node, but does not affect the nodes below
00810 # it. These are just made into chiildren of the parent of
00811 # this node, in its place.
00812
00813 $tree cut $node
00814 }
00815 script - option - select - meta - map - img {
00816 # Removes this node and everything below it.
00817 $tree delete $node
00818 }
00819 default {
00820 # Ignore tag
00821 }
00822 }
00823 }
00824
00825 /* htmlparse::RemoveFormDefs --*/
00826 /* */
00827 /* Internal helper command to*/
00828 /* '::htmlparse::removeFormDefs'. Does the actual work.*/
00829 /* */
00830 /* Arguments:*/
00831 /* tree The name of the tree currently processed*/
00832 /* node The name of the node to look at.*/
00833 /* */
00834 /* Side Effects:*/
00835 /* Modifies the specified tree.*/
00836 /* */
00837 /* Results:*/
00838 /* None.*/
00839
00840 ret ::htmlparse::RemoveFormDefs (type tree , type node) {
00841 switch -exact -- [$tree get $node type] {
00842 form {
00843 $tree delete $node
00844 }
00845 default {
00846 # Ignore tag
00847 }
00848 }
00849 }
00850
00851 /* htmlparse::Reorder --*/
00852
00853 /* Internal helper command to '::htmlparse::2tree'. Moves the*/
00854 /* nodes between p/p, li/li and h<i> sequences below the*/
00855 /* paragraphs and items. IOW, corrects misconstructions for*/
00856 /* the optional node types.*/
00857 /* */
00858 /* Arguments:*/
00859 /* tree The name of the tree currently processed*/
00860 /* node The name of the node to look at.*/
00861 /* */
00862 /* Side Effects:*/
00863 /* Modifies the specified tree.*/
00864 /* */
00865 /* Results:*/
00866 /* None.*/
00867
00868 ret ::htmlparse::Reorder (type tree , type node) {
00869 switch -exact -- [set tp [$tree get $node type]] {
00870 h1 - h2 - h3 - h4 - h5 - h6 - p - li {
00871 # Look for right siblings until the next node with a
00872 # similar type (or end of level) and move these below this
00873 # node.
00874
00875 while {1} {
00876 set sibling [$tree next $node]
00877 if {
00878 ($sibling == {}) ||
00879 ([lsearch -exact {h1 h2 h3 h4 h5 h6 p li} [$tree get $sibling type]] != -1)
00880 } {
00881 break
00882 }
00883 $tree move $node end $sibling
00884 }
00885 }
00886 default {
00887 # Ignore tag
00888 }
00889 }
00890 }
00891
00892 /* ### ######### ###########################*/
00893
00894 package provide htmlparse 1.1.2
00895