modules/bibtex/bibtex.tcl

Go to the documentation of this file.
00001 /* */
00002 /* */
00003 /*  "BibTeX parser"*/
00004 /*  http://wiki.tcl.tk/13719*/
00005 /* */
00006 /*  Tcl code harvested on:   7 Mar 2005, 23:55 GMT*/
00007 /*  Wiki page last updated: ???*/
00008 /* */
00009 /* */
00010 
00011 /*  bibtex.tcl --*/
00012 /* */
00013 /*       A basic parser for BibTeX bibliography databases.*/
00014 /* */
00015 /*  Copyright (c) 2005 Neil Madden.*/
00016 /*  Copyright (c) 2005 Andreas Kupries.*/
00017 /*  License: Tcl/BSD style.*/
00018 
00019 /*  NOTES*/
00020 /* */
00021 /*  Need commands to introspect parser state. Especially the string*/
00022 /*  map (for testing of 'addStrings', should be useful in general as*/
00023 /*  well).*/
00024 
00025 /*  ### ### ### ######### ######### #########*/
00026 /*  Requisites*/
00027 
00028 package require Tcl 8.4
00029 package require cmdline
00030 
00031 /*  ### ### ### ######### ######### #########*/
00032 /*  Implementation: Public API*/
00033 
00034 namespace ::bibtex {}
00035 
00036 /*  bibtex::parse --*/
00037 /* */
00038 /*  Parse a bibtex file.*/
00039 /* */
00040 /*  parse ?options? ?bibtex?*/
00041 
00042 ret  ::bibtex::parse (type args) {
00043     variable data
00044     variable id
00045 
00046     # Argument processing
00047     if {[llength $args] < 1} {
00048     set err "[lindex [info level 0] 0] ?options? ?bibtex?"
00049     return -code error "wrong # args: should be \"$err\""
00050     }
00051 
00052     array set state {}
00053     GetOptions $args state
00054 
00055     # Initialize the parser state from the options, fill in default
00056     # values, and handle the input according the specified mode.
00057 
00058     set token bibtex[incr id]
00059     foreach {k v} [array get state] {
00060     set data($token,$k) $v
00061     }
00062 
00063     if {$state(stream)} {
00064     # Text not in memory
00065     if {!$state(bg)} {
00066         # Text from a channel, no async processing. We read everything
00067         # into memory and the handle it as before.
00068 
00069         set blockmode [fconfigure $state(-channel) -blocking]
00070         fconfigure $state(-channel) -blocking 1
00071         set data($token,buffer) [read $state(-channel)]
00072         fconfigure $state(-channel) -blocking $blockmode
00073 
00074         # Tell upcoming processing that the text is in memory.
00075         set state(stream) 0
00076     } else {
00077         # Text from a channel, and processing is async. Create an
00078         # event handler for the incoming data.
00079 
00080         set data($token,done) 0
00081         fileevent $state(-channel) readable \
00082             [list ::bibtex::ReadChan $token]
00083 
00084         # Initialize the parser internal result buffer if we use plain
00085         # -command, and not the SAX api.
00086         if {!$state(sax)} {
00087         set data($token,result) {}
00088         }
00089     }
00090     }
00091 
00092     # Initialize the string mappings (none known), and the result
00093     # accumulator.
00094     set data($token,strings) {}
00095     set data($token,result)  {}
00096 
00097     if {!$state(stream)} {
00098     ParseRecords $token 1
00099     if {$state(sax)} {
00100         set result $token
00101     } else {
00102         set result $data($token,result)
00103         destroy $token
00104     }
00105     return $result
00106     }
00107 
00108     # Assert: Processing is in background.
00109     return $token
00110 }
00111 
00112 /*  Cleanup a parser, cancelling any callbacks etc.*/
00113 
00114 ret  ::bibtex::destroy (type token) {
00115     variable data
00116 
00117     if {![info exists data($token,stream)]} {
00118     return -code error "Illegal bibtex parser \"$token\""
00119     }
00120     if {$data($token,stream)} {
00121     fileevent $data($token,-channel) readable {}
00122     }
00123 
00124     array unset data $token,*
00125     return
00126 }
00127 
00128 
00129 ret  ::bibtex::wait (type token) {
00130     variable data
00131 
00132     if {![info exists data($token,stream)]} {
00133     return -code error "Illegal bibtex parser \"$token\""
00134     }
00135     vwait ::bibtex::data($token,done)
00136     return
00137 }
00138 
00139 /*  bibtex::addStrings --*/
00140 /* */
00141 /*  Add strings to the map for a particular parser. All strings are*/
00142 /*  expanded at parse time.*/
00143 
00144 ret  ::bibtex::addStrings (type token , type strings) {
00145     variable data
00146     eval [linsert $strings 0 lappend data($token,strings)]
00147     return
00148 }
00149 
00150 /*  ### ### ### ######### ######### #########*/
00151 /*  Implementation: Private utility routines*/
00152 
00153 ret  ::bibtex::AddRecord (type token , type type , type key , type recdata) {
00154     variable data
00155     lappend  data($token,result) [list $type $key $recdata]
00156     return
00157 }
00158 
00159 ret  ::bibtex::GetOptions (type argv , type statevar) {
00160     upvar 1 $statevar state
00161 
00162     # Basic processing of the argument list
00163     # and the options found therein.
00164 
00165     set opts [lrange [::cmdline::GetOptionDefaults {
00166     {command.arg         {}}
00167     {channel.arg         {}}
00168     {recordcommand.arg   {}}
00169     {preamblecommand.arg {}}
00170     {stringcommand.arg   {}}
00171     {commentcommand.arg  {}}
00172     {progresscommand.arg {}}
00173     } result] 2 end] ;# Remove ? and help.
00174 
00175     set argc [llength $argv]
00176     while {[set err [::cmdline::getopt argv $opts opt arg]]} {
00177     if {$err < 0} {
00178         set olist ""
00179         foreach o [lsort $opts] {
00180         if {[string match *.arg $o]} {
00181             set o [string range $o 0 end-4]
00182         }
00183         lappend olist -$o
00184         }
00185         return -code error "bad option \"$opt\",\
00186             should be one of\
00187             [linsert [join $olist ", "] end-1 or]"
00188     }
00189     set state(-$opt) $arg
00190     }
00191 
00192     # Check the information gained so far
00193     # for inconsistencies and/or missing
00194     # pieces.
00195 
00196     set sax [expr {
00197     [info exists state(-recordcommand)]   ||
00198     [info exists state(-preamblecommand)] ||
00199     [info exists state(-stringcommand)]   ||
00200     [info exists state(-commentcommand)]  ||
00201     [info exists state(-progresscommand)]
00202     }] ; # {}
00203 
00204     set bg [info exists state(-command)]
00205 
00206     if {$sax && $bg} {
00207     # Sax callbacks and channel completion callback exclude each
00208     # other.
00209     return -code error "The options -command and -TYPEcommand exclude each other"
00210     }
00211 
00212     set stream [info exists state(-channel)]
00213 
00214     if {$stream} {
00215     # Channel is present, a text is not allowed.
00216     if {[llength $argv]} {
00217         return -code error "Option -channel and text exclude each other"
00218     }
00219 
00220     # The channel has to exist as well.
00221     if {[lsearch -exact [file channels] $state(-channel)] < 0} {
00222         return -code error "Illegal channel handle \"$state(-channel)\""
00223     }
00224     } else {
00225     # Channel is not present, we have to have a text, and only
00226     # exactly one. And a general -command callback is not allowed.
00227 
00228     if {![llength $argv]} {
00229         return -code error "Neither -channel nor text specified"
00230     } elseif {[llength $argv] > 1} {
00231         return -code error "wrong # args: [lindex [info level 1] 0] ?options? ?bibtex?"
00232     }
00233 
00234     # Channel completion callback is not allowed if we are not
00235     # reading from a channel.
00236 
00237     if {$bg} {
00238         return -code error "Option -command and text exclude each other"
00239     }
00240 
00241     set state(buffer) [lindex $argv 0]
00242     }
00243 
00244     set state(stream) $stream
00245     set state(sax)    $sax
00246     set state(bg)     [expr {$sax || $bg}]
00247 
00248     if {![info exists state(-stringcommand)]} {
00249     set state(-stringcommand) [list ::bibtex::addStrings]
00250     }
00251     if {![info exists state(-recordcommand)] && (!$sax)} {
00252     set state(-recordcommand) [list ::bibtex::AddRecord]
00253     }
00254     return
00255 }
00256 
00257 ret  ::bibtex::Callback (type token , type type , type args) {
00258     variable data
00259 
00260     #puts stdout "Callback ($token $type ($args))"
00261 
00262     if {[info exists data($token,-${type}command)]} {
00263     eval $data($token,-${type}command) [linsert $args 0 $token]
00264     }
00265     return
00266 }
00267 
00268 ret  ::bibtex::ReadChan (type token) {
00269     variable data
00270 
00271     # Read the waiting characters into our buffer and process
00272     # them. The records are saved either through a user supplied
00273     # record callback, or the standard callback for our non-sax
00274     # processing.
00275 
00276     set    chan $data($token,-channel)
00277     append data($token,buffer) [read $chan]
00278 
00279     if {[eof $chan]} {
00280     # Final processing. In non-SAX mode we have to deliver the
00281     # completed result before destroying the parser.
00282 
00283     ParseRecords $token 1
00284     set data($token,done) 1
00285     if {!$data($token,sax)} {
00286         Callback $token {} $data($token,result)
00287     }
00288     return
00289     }
00290 
00291     # Processing of partial data.
00292 
00293     ParseRecords $token 0
00294     return
00295 }
00296 
00297 ret  ::bibtex::Tidy (type str) {
00298     return [string tolower [string trim $str]]
00299 }
00300 
00301 ret  ::bibtex::ParseRecords (type token , type eof) {
00302     # A rough BibTeX grammar (case-insensitive):
00303     #
00304     # Database      ::= (Junk '@' Entry)*
00305     # Junk          ::= .*?
00306     # Entry         ::= Record
00307     #               |   Comment
00308     #               |   String
00309     #               |   Preamble
00310     # Comment       ::= "comment" [^\n]* \n         -- ignored
00311     # String        ::= "string" '{' Field* '}'
00312     # Preamble      ::= "preamble" '{' .* '}'       -- (balanced)
00313     # Record        ::= Type '{' Key ',' Field* '}'
00314     #               |   Type '(' Key ',' Field* ')' -- not handled
00315     # Type          ::= Name
00316     # Key           ::= Name
00317     # Field         ::= Name '=' Value
00318     # Name          ::= [^\s\"#%'(){}]*
00319     # Value         ::= [0-9]+
00320     #               |   '"' ([^'"']|\\'"')* '"'
00321     #               |   '{' .* '}'                  -- (balanced)
00322 
00323     # " - Fixup emacs hilit confusion from the grammar above.
00324     variable data
00325     set bibtex $data($token,buffer)
00326 
00327     # Split at each @ character which is at the beginning of a line,
00328     # modulo whitespace. This is a heuristic to distinguish the @'s
00329     # starting a new record from the @'s occuring inside a record, as
00330     # part of email addresses. Empty pices at beginning or end are
00331     # stripped before the split.
00332 
00333     regsub -line -all {^[\n\r\f\t ]*@} $bibtex \000 bibtex
00334     set db [split [string trim $bibtex \000] \000]
00335 
00336     if {$eof} {
00337     set total [llength $db]
00338     set step  [expr {double($total) / 100.0}]
00339     set istep [expr {$step > 1 ? int($step) : 1}]
00340     set count 0
00341     } else {
00342     if {[llength $db] < 2} {
00343         # Nothing to process, or data which ay be incomplete.
00344         return
00345     }
00346 
00347     set data($token,buffer) [lindex $db end]
00348     set db                  [lrange $db 0 end-1]
00349 
00350     # Fake progress meter.
00351     set count -1
00352     }
00353 
00354     foreach block $db {
00355     if {$count < 0} {
00356         Callback $token progress -1
00357     } elseif {([incr count] % $istep) == 0} {
00358         Callback $token progress [expr {int($count / $step)}]
00359     }
00360     if {[regexp -nocase {\s*comment([^\n])*\n(.*)} $block \
00361         -> cmnt rest]} {
00362         # Are @comments blocks, or just 1 line?
00363         # Does anyone care?
00364         Callback $token comment $cmnt
00365 
00366     } elseif {[regexp -nocase {\s*string[^\{]*\{(.*)\}[^\}]*} \
00367         $block -> rest]} {
00368         # string macro defs
00369         Callback $token string [ParseBlock $rest]
00370 
00371     } elseif {[regexp -nocase {\s*preamble[^\{]*\{(.*)\}[^\}]*} \
00372         $block -> rest]} {
00373         Callback $token preamble $rest
00374 
00375     } elseif {[regexp {([^\{]+)\{([^,]*),(.*)\}[^\}]*} \
00376         $block -> type key rest]} {
00377         # Do any @string mappings (these are case insensitive)
00378         set rest [string map -nocase $data($token,strings) $rest]
00379         Callback $token record [Tidy $type] [string trim $key] \
00380             [ParseBlock $rest]
00381     } else {
00382         ## FUTURE: Use a logger.
00383         puts stderr "Skipping: $block"
00384     }
00385     }
00386 }
00387 
00388 ret  ::bibtex::ParseBlock (type block) {
00389     set ret   [list]
00390     set index 0
00391     while {
00392     [regexp -start $index -indices -- \
00393         {(\S+)[^=]*=(.*)} $block -> key rest]
00394     } {
00395     foreach {ks ke} $key break
00396     set k [Tidy [string range $block $ks $ke]]
00397     foreach {rs re} $rest break
00398     foreach {v index} \
00399         [ParseBibString $rs [string range $block $rs $re]] \
00400         break
00401     lappend ret $k $v
00402     }
00403     return $ret
00404 }
00405 
00406 ret  ::bibtex::ParseBibString (type index , type str) {
00407     set count 0
00408     set retstr ""
00409     set escape 0
00410     set string 0
00411     foreach char [split $str ""] {
00412     incr index
00413     if {$escape} {
00414         set escape 0
00415     } else {
00416         if {$char eq "\{"} {
00417         incr count
00418         continue
00419         } elseif {$char eq "\}"} {
00420         incr count -1
00421         if {$count < 0} {incr index -1; break}
00422         continue
00423         } elseif {$char eq ","} {
00424         if {$count == 0} break
00425         } elseif {$char eq "\\"} {
00426         set escape 1
00427         continue
00428         } elseif {$char eq "\""} {
00429         # Managing the count ensures that comma inside of a
00430         # string is not considered as the end of the field.
00431         if {!$string} {
00432             incr count
00433             set string 1
00434         } else {
00435             incr count -1
00436             set string 0
00437         }
00438         continue
00439         }
00440         # else: Nothing
00441     }
00442     append retstr $char
00443     }
00444     regsub -all {\s+} $retstr { } retstr
00445     return [list [string trim $retstr] $index]
00446 }
00447 
00448 
00449 /*  ### ### ### ######### ######### #########*/
00450 /*  Internal. Package configuration and state.*/
00451 
00452 namespace bibtex {
00453     /*  Counter for the generation of parser tokens.*/
00454     variable id 0
00455 
00456     /*  State of all parsers. Keys for each parser are prefixed with the*/
00457     /*  parser token.*/
00458     variable  data
00459     array  data =  {}
00460 
00461     /*  Keys and their meaning (listed without token prefix)*/
00462     /** 
00463      * buffer
00464      * eof
00465      * channel    <-\/- Difference ?
00466      * strings      |
00467      * -async       |
00468      * -blocksize   |
00469      * -channel   <-/
00470      * -recordcommand   -- callback for each record
00471      * -preamblecommand -- callback for @preamble blocks
00472      * -stringcommand   -- callback for @string macros
00473      * -commentcommand  -- callback for @comment blocks
00474      * -progresscommand -- callback to indicate progress of parse
00475      *#
00476  */
00477 }
00478 
00479 /*  ### ### ### ######### ######### #########*/
00480 /*  Ready to go*/
00481 package provide bibtex 0.5
00482 /*  EOF*/
00483