00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028 package require Tcl 8.4
00029 package require cmdline
00030
00031
00032
00033
# Create the package namespace up front so that the fully qualified
# procedure definitions below have a namespace to live in.
namespace eval ::bibtex {}
00035
00036
00037
00038
00039
00040
00041
# ::bibtex::parse --
#
#	Create a BibTeX parser and either run it to completion or start
#	it in the background on a channel.
#
# Arguments:
#	args	?options? ?bibtex? -- handed unchanged to GetOptions, which
#		validates them and derives the processing mode.
#
# Results:
#	* Text in memory, or a channel read synchronously, without SAX
#	  callbacks: the accumulated result list (records are appended by
#	  AddRecord as {type key recdata}); the parser is destroyed
#	  before returning.
#	* Text in memory with SAX callbacks: the parser token, after all
#	  records have been processed.
#	* Background processing of a channel: the parser token; parsing
#	  continues from the event loop via ReadChan.
#
#	Raises an error when called without arguments, or when GetOptions
#	rejects the arguments.

proc ::bibtex::parse {args} {
    variable data
    variable id

    # Argument processing
    if {[llength $args] < 1} {
        set err "[lindex [info level 0] 0] ?options? ?bibtex?"
        return -code error "wrong # args: should be \"$err\""
    }

    array set state {}
    GetOptions $args state

    # Initialize the parser state from the options, fill in default
    # values, and handle the input according to the specified mode.

    set token bibtex[incr id]
    foreach {k v} [array get state] {
        set data($token,$k) $v
    }

    if {$state(stream)} {
        # Text not in memory
        if {!$state(bg)} {
            # Text from a channel, no async processing. We read
            # everything into memory and then handle it like text given
            # directly. The blocking mode of the channel is restored
            # after the read.

            set blockmode [fconfigure $state(-channel) -blocking]
            fconfigure $state(-channel) -blocking 1
            set data($token,buffer) [read $state(-channel)]
            fconfigure $state(-channel) -blocking $blockmode

            # Tell upcoming processing that the text is in memory. The
            # copy stored for the token is updated as well, otherwise
            # 'destroy' would believe a readable handler had been
            # installed and remove whatever handler the caller has on
            # the channel.
            set state(stream) 0
            set data($token,stream) 0
        } else {
            # Text from a channel, and processing is async. Create an
            # event handler for the incoming data.

            set data($token,done) 0
            fileevent $state(-channel) readable \
                [list ::bibtex::ReadChan $token]
        }
    }

    # Initialize the string mappings (none known), and the result
    # accumulator.
    set data($token,strings) {}
    set data($token,result) {}

    if {!$state(stream)} {
        ParseRecords $token 1
        if {$state(sax)} {
            set result $token
        } else {
            set result $data($token,result)
            destroy $token
        }
        return $result
    }

    # Assert: Processing is in background.
    return $token
}
00111
00112
00113
# ::bibtex::destroy --
#
#	Release all state kept for a parser.
#
# Arguments:
#	token	Parser token as returned by ::bibtex::parse.
#
# Results:
#	None. Raises an error if the token does not name a known parser.
#	If the parser is marked as streaming, the readable handler on its
#	channel is removed first.

proc ::bibtex::destroy {token} {
    variable data

    if {![info exists data($token,stream)]} {
        return -code error "Illegal bibtex parser \"$token\""
    }
    if {$data($token,stream)} {
        fileevent $data($token,-channel) readable {}
    }

    # Every element belonging to the parser is keyed "<token>,<name>".
    array unset data $token,*
    return
}
00127
00128
# ::bibtex::wait --
#
#	Block, running the event loop, until a background parser has
#	seen the end of its input.
#
# Arguments:
#	token	Parser token as returned by ::bibtex::parse.
#
# Results:
#	None. Raises an error if the token does not name a known parser.
#	Returns immediately when there is nothing to wait for, i.e. the
#	parser has no completion flag (it was never asynchronous) or the
#	flag is already set.

proc ::bibtex::wait {token} {
    variable data

    if {![info exists data($token,stream)]} {
        return -code error "Illegal bibtex parser \"$token\""
    }

    # Only enter the event loop while completion is actually pending;
    # an unconditional vwait would never return for a parser which is
    # already done or which does not run in the background at all.
    if {[info exists data($token,done)] && !$data($token,done)} {
        vwait ::bibtex::data($token,done)
    }
    return
}
00138
00139
00140
00141
00142
00143
# ::bibtex::addStrings --
#
#	Default handler for string definitions: append the given
#	elements to the string mapping kept for the parser. ParseRecords
#	later passes that list to 'string map'.
#
# Arguments:
#	token	Parser token.
#	strings	List whose elements are appended one by one.
#
# Results:
#	None.

proc ::bibtex::addStrings {token strings} {
    variable data
    # Build "lappend var e1 e2 ..." as a proper list so every element
    # of $strings becomes one argument (no expansion syntax in 8.4).
    eval [linsert $strings 0 lappend data($token,strings)]
    return
}
00149
00150
00151
00152
# ::bibtex::AddRecord --
#
#	Default record handler for non-SAX processing: append one record
#	to the result accumulated for the parser.
#
# Arguments:
#	token	Parser token.
#	type	Entry type of the record.
#	key	Key of the record.
#	recdata	Field data of the record.
#
# Results:
#	None. The record is stored as the list {type key recdata}.

proc ::bibtex::AddRecord {token type key recdata} {
    variable data
    lappend data($token,result) [list $type $key $recdata]
    return
}
00158
# ::bibtex::GetOptions --
#
#	Parse and validate the arguments of ::bibtex::parse.
#
# Arguments:
#	argv		Argument list: options, optionally followed by the
#			text to parse.
#	statevar	Name of an array in the caller's scope receiving
#			the result.
#
# Results:
#	None. The array gets one element "-<option>" per option given,
#	plus:
#	  stream	1 if -channel was given, else 0
#	  sax		1 if any of -recordcommand, -preamblecommand,
#			-stringcommand, -commentcommand, -progresscommand
#			was given, else 0
#	  bg		1 if sax is set or -command was given
#	  buffer	the text to parse (only without -channel)
#	Defaults are installed for -stringcommand (always) and for
#	-recordcommand (only when no SAX callback was given).
#
#	Raises an error for unknown options, mutually exclusive
#	combinations, an unknown channel, or a missing/superfluous text.

proc ::bibtex::GetOptions {argv statevar} {
    upvar 1 $statevar state

    # Basic processing of the argument list
    # and the options found therein.

    set opts [lrange [::cmdline::GetOptionDefaults {
        {command.arg         {}}
        {channel.arg         {}}
        {recordcommand.arg   {}}
        {preamblecommand.arg {}}
        {stringcommand.arg   {}}
        {commentcommand.arg  {}}
        {progresscommand.arg {}}
    } result] 2 end] ;# Remove ? and help.

    while {[set err [::cmdline::getopt argv $opts opt arg]]} {
        if {$err < 0} {
            # Build the list of legal options for the error message,
            # without the ".arg" markers used by cmdline.
            set olist ""
            foreach o [lsort $opts] {
                if {[string match *.arg $o]} {
                    set o [string range $o 0 end-4]
                }
                lappend olist -$o
            }
            return -code error "bad option \"$opt\",\
                should be one of\
                [linsert [join $olist ", "] end-1 or]"
        }
        set state(-$opt) $arg
    }

    # Check the information gained so far
    # for inconsistencies and/or missing
    # pieces.

    set sax [expr {
        [info exists state(-recordcommand)]   ||
        [info exists state(-preamblecommand)] ||
        [info exists state(-stringcommand)]   ||
        [info exists state(-commentcommand)]  ||
        [info exists state(-progresscommand)]
    }]

    set bg [info exists state(-command)]

    if {$sax && $bg} {
        # Sax callbacks and channel completion callback exclude each
        # other.
        return -code error "The options -command and -TYPEcommand exclude each other"
    }

    set stream [info exists state(-channel)]

    if {$stream} {
        # Channel is present, a text is not allowed.
        if {[llength $argv]} {
            return -code error "Option -channel and text exclude each other"
        }

        # The channel has to exist as well.
        if {[lsearch -exact [file channels] $state(-channel)] < 0} {
            return -code error "Illegal channel handle \"$state(-channel)\""
        }
    } else {
        # Channel is not present, we have to have a text, and only
        # exactly one. And a general -command callback is not allowed.

        if {![llength $argv]} {
            return -code error "Neither -channel nor text specified"
        } elseif {[llength $argv] > 1} {
            # Name the command which called us (level -1), not whatever
            # happens to be the outermost procedure on the stack.
            return -code error "wrong # args: [lindex [info level -1] 0] ?options? ?bibtex?"
        }

        # Channel completion callback is not allowed if we are not
        # reading from a channel.

        if {$bg} {
            return -code error "Option -command and text exclude each other"
        }

        set state(buffer) [lindex $argv 0]
    }

    set state(stream) $stream
    set state(sax)    $sax
    set state(bg)     [expr {$sax || $bg}]

    if {![info exists state(-stringcommand)]} {
        set state(-stringcommand) [list ::bibtex::addStrings]
    }
    if {![info exists state(-recordcommand)] && (!$sax)} {
        set state(-recordcommand) [list ::bibtex::AddRecord]
    }
    return
}
00256
# ::bibtex::Callback --
#
#	Invoke the callback registered for an event type, if any.
#
# Arguments:
#	token	Parser token; passed to the callback as first argument.
#	type	Event type. The callback is looked up under the option
#		"-<type>command"; an empty type therefore selects the
#		plain -command callback.
#	args	Further arguments for the callback.
#
# Results:
#	None. Nothing happens when no callback is registered for the
#	type.

proc ::bibtex::Callback {token type args} {
    variable data

    #puts stdout "Callback ($token $type ($args))"

    if {[info exists data($token,-${type}command)]} {
        # The stored command prefix is extended with the token and the
        # remaining arguments, each as a separate word.
        eval $data($token,-${type}command) [linsert $args 0 $token]
    }
    return
}
00267
# ::bibtex::ReadChan --
#
#	Readable-event handler for background parsing. Reads the waiting
#	characters into the parser buffer and processes them. The records
#	are saved either through a user supplied record callback, or the
#	standard callback of the non-SAX processing.
#
# Arguments:
#	token	Parser token.
#
# Results:
#	None. At end of input the completion flag of the parser is set
#	and, in non-SAX mode, the -command callback is invoked with the
#	accumulated result.

proc ::bibtex::ReadChan {token} {
    variable data

    set chan $data($token,-channel)
    append data($token,buffer) [read $chan]

    if {[eof $chan]} {
        # A channel at EOF stays readable. Remove the handler now,
        # otherwise it would fire again and again, re-processing the
        # final buffer, until the parser is destroyed.
        fileevent $chan readable {}

        # Final processing. In non-SAX mode we have to deliver the
        # completed result before destroying the parser.

        ParseRecords $token 1
        set data($token,done) 1
        if {!$data($token,sax)} {
            Callback $token {} $data($token,result)
        }
        return
    }

    # Processing of partial data.

    ParseRecords $token 0
    return
}
00296
# ::bibtex::Tidy --
#
#	Normalize a name: strip surrounding whitespace and convert to
#	lower case.
#
# Arguments:
#	str	The string to normalize.
#
# Results:
#	The normalized string.

proc ::bibtex::Tidy {str} {
    return [string tolower [string trim $str]]
}
00300
# ::bibtex::ParseRecords --
#
#	Split the buffered text of a parser into entries and report each
#	of them through Callback as 'comment', 'string', 'preamble' or
#	'record'. Progress is reported through the 'progress' callback.
#
# Arguments:
#	token	Parser token.
#	eof	True: the buffer holds all remaining input, process every
#		piece and report progress as a percentage.
#		False: more input may follow; the last piece is kept in the
#		buffer for the next call and progress is reported as -1.
#
# Results:
#	None. Pieces which are not recognized are reported on stderr and
#	skipped.

proc ::bibtex::ParseRecords {token eof} {
    # A rough BibTeX grammar (case-insensitive):
    #
    # Database      ::= (Junk '@' Entry)*
    # Junk          ::= .*?
    # Entry         ::= Record
    #               |   Comment
    #               |   String
    #               |   Preamble
    # Comment       ::= "comment" [^\n]* \n         -- ignored
    # String        ::= "string" '{' Field* '}'
    # Preamble      ::= "preamble" '{' .* '}'       -- (balanced)
    # Record        ::= Type '{' Key ',' Field* '}'
    #               |   Type '(' Key ',' Field* ')' -- not handled
    # Type          ::= Name
    # Key           ::= Name
    # Field         ::= Name '=' Value
    # Name          ::= [^\s\"#%'(){}]*
    # Value         ::= [0-9]+
    #               |   '"' ([^'"']|\\'"')* '"'
    #               |   '{' .* '}'                  -- (balanced)

    # " - Fixup emacs hilit confusion from the grammar above.
    variable data
    set bibtex $data($token,buffer)

    # Split at each @ character which is at the beginning of a line,
    # modulo whitespace. This is a heuristic to distinguish the @'s
    # starting a new record from the @'s occurring inside a record, as
    # part of email addresses. Empty pieces at beginning or end are
    # stripped before the split.

    regsub -line -all {^[\n\r\f\t ]*@} $bibtex \000 bibtex
    set db [split [string trim $bibtex \000] \000]

    if {$eof} {
        # Report progress in percent, at most once per percent.
        set total [llength $db]
        set step  [expr {double($total) / 100.0}]
        set istep [expr {$step > 1 ? int($step) : 1}]
        set count 0
    } else {
        if {[llength $db] < 2} {
            # Nothing to process, or data which may be incomplete.
            return
        }

        # The last piece may be incomplete. Keep it for the next round.
        set data($token,buffer) [lindex $db end]
        set db                  [lrange $db 0 end-1]

        # Fake progress meter.
        set count -1
    }

    foreach block $db {
        if {$count < 0} {
            Callback $token progress -1
        } elseif {([incr count] % $istep) == 0} {
            Callback $token progress [expr {int($count / $step)}]
        }

        # The keyword tests are anchored at the start of the block, so
        # that a record which merely contains one of the words in a
        # field is not mistaken for a comment, string or preamble.

        if {[regexp -nocase {^\s*comment([^\n]*)(?:\n|$)} $block \
                -> cmnt]} {
            # Are @comments blocks, or just 1 line?
            # Does anyone care?
            # The whole remainder of the line is reported.
            Callback $token comment $cmnt

        } elseif {[regexp -nocase {^\s*string[^\{]*\{(.*)\}[^\}]*} \
                $block -> rest]} {
            # string macro defs
            Callback $token string [ParseBlock $rest]

        } elseif {[regexp -nocase {^\s*preamble[^\{]*\{(.*)\}[^\}]*} \
                $block -> rest]} {
            Callback $token preamble $rest

        } elseif {[regexp {([^\{]+)\{([^,]*),(.*)\}[^\}]*} \
                $block -> type key rest]} {
            # Do any @string mappings (these are case insensitive).
            # Note that 'string map' replaces every occurrence of a
            # macro name in the text of the record.
            set rest [string map -nocase $data($token,strings) $rest]
            Callback $token record [Tidy $type] [string trim $key] \
                [ParseBlock $rest]
        } else {
            ## FUTURE: Use a logger.
            puts stderr "Skipping: $block"
        }
    }
}
00387
# ::bibtex::ParseBlock --
#
#	Parse a sequence of "name = value" fields.
#
# Arguments:
#	block	The text containing the fields.
#
# Results:
#	A flat list of alternating names and values. Names are
#	normalized by Tidy, values are extracted by ParseBibString.

proc ::bibtex::ParseBlock {block} {
    set ret   [list]
    set index 0
    while {
        [regexp -start $index -indices -- \
            {(\S+)[^=]*=(.*)} $block -> key rest]
    } {
        # With -indices both match variables hold {first last} offsets
        # into the block. 'foreach ... break' unpacks them (8.4 has no
        # lassign).
        foreach {ks ke} $key break
        set k [Tidy [string range $block $ks $ke]]
        foreach {rs re} $rest break

        # ParseBibString returns the value and the offset at which the
        # search for the next field continues.
        foreach {v index} \
            [ParseBibString $rs [string range $block $rs $re]] \
            break
        lappend ret $k $v
    }
    return $ret
}
00405
# ::bibtex::ParseBibString --
#
#	Extract one field value from the beginning of a string.
#
#	Unescaped braces and double quotes delimit the value and are
#	dropped from it. A backslash is dropped and protects the next
#	character. The value ends at the first unescaped comma outside
#	of braces/quotes, at a closing brace without matching opening
#	brace, or at the end of the string.
#
# Arguments:
#	index	Offset of the first character of 'str' in the text it was
#		taken from.
#	str	The text starting with the value.
#
# Results:
#	A list of two elements: the value, with whitespace runs collapsed
#	to single blanks and trimmed, and the updated offset. The offset
#	is advanced past a terminating comma, but not past an unmatched
#	closing brace.

proc ::bibtex::ParseBibString {index str} {
    set count  0  ;# Nesting depth of braces/quotes.
    set retstr ""
    set escape 0  ;# True if the previous character was a backslash.
    set string 0  ;# True while inside of a double-quoted part.

    foreach char [split $str ""] {
        incr index
        if {$escape} {
            # Escaped character, taken literally.
            set escape 0
        } else {
            if {$char eq "\{"} {
                incr count
                continue
            } elseif {$char eq "\}"} {
                incr count -1
                # Brace of the enclosing entry: not part of the value,
                # and not consumed either.
                if {$count < 0} {incr index -1; break}
                continue
            } elseif {$char eq ","} {
                if {$count == 0} break
            } elseif {$char eq "\\"} {
                set escape 1
                continue
            } elseif {$char eq "\""} {
                # Managing the count ensures that comma inside of a
                # string is not considered as the end of the field.
                if {!$string} {
                    incr count
                    set string 1
                } else {
                    incr count -1
                    set string 0
                }
                continue
            }
            # else: Nothing
        }
        append retstr $char
    }
    regsub -all {\s+} $retstr { } retstr
    return [list [string trim $retstr] $index]
}
00447
00448
00449
00450
00451
# Package state. The namespace is addressed by its absolute name, like
# the procedure definitions, so that the variables and the procedures
# always end up in the same namespace.
namespace eval ::bibtex {
    # Counter used by 'parse' to generate the parser tokens
    # ("bibtex<n>").
    variable id 0

    # State of all parsers. The elements are keyed "<token>,<name>".
    variable data
    array set data {}
}
00478
00479
00480
00481 package provide bibtex 0.5
00482
00483