00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016 package provide math::statistics 0.5
00017
00018
00019
00020
00021
00022 namespace ::math::statistics {
00023
00024
00025
00026 namespace export mean min max number var stdev pvar pstdev basic-stats corr \
00027 histogram interval-mean-stdev t-test-mean quantiles \
00028 test-normal lillieforsFit \
00029 autocorr crosscorr filter map samplescount median \
00030 test-2x2 print-2x2 control-xbar test_xbar \
00031 control-Rchart test-Rchart
00032
00033
00034
00035 variable NEGSTDEV {Zero or negative standard deviation}
00036 variable TOOFEWDATA {Too few or invalid data}
00037 variable OUTOFRANGE {Argument out of range}
00038
00039
00040
00041
00042 variable factorNormalPdf
00043 factorNormalPdf = [expr {sqrt(8.0*atan(1.0))}]
00044
00045
00046
00047
00048
00049
00050
00051 variable control_factors {
00052 A2 {1.880 1.093 0.729 0.577 0.483 0.419 0.419}
00053 D3 {0.0 0.0 0.0 0.0 0.0 0.076 0.076}
00054 D4 {3.267 2.574 2.282 2.114 2.004 1.924 1.924}
00055 }
00056 }
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
00067
00068
00069
00070 namespace ::math::statistics {
00071 foreach type {mean min max number stdev var pstdev pvar} {
00072 ret $type ( type values ) "BasicStats $type \$values"
00073 }
00074 proc basic-stats { values } "BasicStats all \$values"
00075 }
00076
00077 # BasicStats --
00078 # Return the one or all of the basic statistical properties
00079 #
00080 # Arguments:
00081 # type Type of value to be returned
00082 # values List of values to be examined
00083 #
00084 # Results:
00085 # Value that was required
00086 #
00087 proc ::math::statistics::BasicStats { type values } {
00088 variable TOOFEWDATA
00089
00090 set min {}
00091 set max {}
00092 set mean {}
00093 set stdev {}
00094 set var {}
00095
00096 set sum 0.0
00097 set sumsq 0.0
00098 set number 0
00099
00100 foreach value $values {
00101 if { $value == {} } {
00102 continue
00103 }
00104 value = [expr {double($value)}]
00105
00106 incr number
00107 sum = [expr {$sum+$value}]
00108 sumsq = [expr {$sumsq+$value*$value}]
00109
00110 if { $min == {} || $value < $min } {
00111 min = $value
00112 }
00113 if { $max == {} || $value > $max } {
00114 max = $value
00115 }
00116 }
00117
00118 if { $number > 0 } {
00119 mean = [expr {$sum/$number}]
00120 } else {
00121 return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA
00122 }
00123
00124 if { $number > 1 } {
00125 var = [expr {($sumsq-$mean*$sum)/double($number-1)}]
00126
00127
00128
00129
00130 if { $var < 0.0 } {
00131 var = 0.0
00132 }
00133 stdev = [expr {sqrt($var)}]
00134 }
00135 pvar = [expr {($sumsq-$mean*$sum)/double($number)}]
00136
00137
00138
00139
00140 if { $pvar < 0.0 } {
00141 pvar = 0.0
00142 }
00143 pstdev = [expr {sqrt($pvar)}]
00144
00145 all = [list $mean $min $max $number $stdev $var $pstdev $pvar]
00146
00147
00148
00149
00150 if { [lsearch {all mean min max number stdev var pstdev pvar} $type] >= 0 } {
00151
00152 return [ $type = ]
00153 } else {
00154 return -code error \
00155 -errorcode ARG -errorinfo [list unknown type of statistic -- $type] \
00156 [list unknown type of statistic -- $type]
00157 }
00158 }
00159
00160
00161
00162
00163
00164
00165
00166
00167
00168
00169
00170
00171
00172 ret ::math::statistics::histogram ( type limits , type values ) {
00173
00174 if { [llength $limits] < 1 } {
00175 return -code error -errorcode ARG -errorinfo {No limits given} {No limits given}
00176 }
00177
00178 set limits [lsort -real -increasing $limits]
00179
00180 for { set index 0 } { $index <= [llength $limits] } { incr index } {
00181 set buckets($index) 0
00182 }
00183 set last [llength $limits]
00184
00185 foreach value $values {
00186 if { $value == {} } {
00187 continue
00188 }
00189
00190 set index 0
00191 set found 0
00192 foreach limit $limits {
00193 if { $value <= $limit } {
00194 set found 1
00195 incr buckets($index)
00196 break
00197 }
00198 incr index
00199 }
00200
00201 if { $found == 0 } {
00202 incr buckets($last)
00203 }
00204 }
00205
00206 set result {}
00207 for { set index 0 } { $index <= $last } { incr index } {
00208 lappend result $buckets($index)
00209 }
00210
00211 return $result
00212 }
00213
00214
00215
00216
00217
00218
00219
00220
00221
00222
00223
00224 ret ::math::statistics::corr ( type data1 , type data2 ) {
00225 variable TOOFEWDATA
00226
00227 set number 0
00228 set sum1 0.0
00229 set sum2 0.0
00230 set sumsq1 0.0
00231 set sumsq2 0.0
00232 set sumprod 0.0
00233
00234 foreach value1 $data1 value2 $data2 {
00235 if { $value1 == {} || $value2 == {} } {
00236 continue
00237 }
00238 set value1 [expr {double($value1)}]
00239 set value2 [expr {double($value2)}]
00240
00241 set sum1 [expr {$sum1+$value1}]
00242 set sum2 [expr {$sum2+$value2}]
00243 set sumsq1 [expr {$sumsq1+$value1*$value1}]
00244 set sumsq2 [expr {$sumsq2+$value2*$value2}]
00245 set sumprod [expr {$sumprod+$value1*$value2}]
00246 incr number
00247 }
00248 if { $number > 0 } {
00249 set numerator [expr {$number*$sumprod-$sum1*$sum2}]
00250 set denom1 [expr {sqrt($number*$sumsq1-$sum1*$sum1)}]
00251 set denom2 [expr {sqrt($number*$sumsq2-$sum2*$sum2)}]
00252 if { $denom1 != 0.0 && $denom2 != 0.0 } {
00253 set corr_coeff [expr {$numerator/$denom1/$denom2}]
00254 } elseif { $denom1 != 0.0 || $denom2 != 0.0 } {
00255 set corr_coeff 0.0 ;# Uniform against non-uniform
00256 } else {
00257 set corr_coeff 1.0 ;# Both uniform
00258 }
00259
00260 } else {
00261 return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA
00262 }
00263 return $corr_coeff
00264 }
00265
00266
00267
00268
00269
00270
00271
00272
00273
00274
00275
00276 ret ::math::statistics::lillieforsFit (type values) {
00277 #
00278 # calculate the goodness of fit according to Lilliefors
00279 # (goodness of fit to a normal distribution)
00280 #
00281 # values -> list of values to be tested for normality
00282 # (these values are sampled counts)
00283 #
00284
00285 # calculate standard deviation and mean of the sample:
00286 set n [llength $values]
00287 if { $n < 5 } {
00288 return -code error "Insufficient number of data (at least five required)"
00289 }
00290 set sd [stdev $values]
00291 set mean [mean $values]
00292
00293 # sort the sample for further processing:
00294 set values [lsort -real $values]
00295
00296 # standardize the sample data (Z-scores):
00297 foreach x $values {
00298 lappend stdData [expr {($x - $mean)/double($sd)}]
00299 }
00300
00301 # compute the value of the distribution function at every sampled point:
00302 foreach x $stdData {
00303 lappend expData [pnorm $x]
00304 }
00305
00306 # compute D+:
00307 set i 0
00308 foreach x $expData {
00309 incr i
00310 lappend dplus [expr {$i/double($n)-$x}]
00311 }
00312 set dplus [lindex [lsort -real $dplus] end]
00313
00314 # compute D-:
00315 set i 0
00316 foreach x $expData {
00317 incr i
00318 lappend dminus [expr {$x-($i-1)/double($n)}]
00319 }
00320 set dminus [lindex [lsort -real $dminus] end]
00321
00322 # Calculate the test statistic D
00323 # by finding the maximal vertical difference
00324 # between the sample and the expectation:
00325 #
00326 set D [expr {$dplus > $dminus ? $dplus : $dminus}]
00327
00328 # We now use the modified statistic Z,
00329 # because D is only reliable
00330 # if the p-value is smaller than 0.1
00331 return [expr {$D * (sqrt($n) - 0.01 + 0.831/sqrt($n))}]
00332 }
00333
00334
00335
00336
00337
00338
00339
00340
00341
00342
00343
00344
00345 ret ::math::statistics::pnorm (type x) {
00346 #
00347 # cumulative distribution function (cdf)
00348 # for the standard normal distribution like in the statistical software 'R'
00349 # (mean=0 and sd=1)
00350 #
00351 # x -> value for which the cdf should be calculated
00352 #
00353 set sum [expr {double($x)}]
00354 set oldSum 0.0
00355 set i 1
00356 set denom 1.0
00357 while {$sum != $oldSum} {
00358 set oldSum $sum
00359 incr i 2
00360 set denom [expr {$denom*$i}]
00361 #puts "$i - $denom"
00362 set sum [expr {$oldSum + pow($x,$i)/$denom}]
00363 }
00364 return [expr {0.5 + $sum * exp(-0.5 * $x*$x - 0.91893853320467274178)}]
00365 }
00366
00367
00368
00369
00370
00371
00372
00373
00374
00375
00376
00377
00378 ret ::math::statistics::pnorm_quicker (type x) {
00379
00380 set n [expr {abs($x)}]
00381 set n [expr {1.0 + $n*(0.04986735 + $n*(0.02114101 + $n*(0.00327763 \
00382 + $n*(0.0000380036 + $n*(0.0000488906 + $n*0.000005383)))))}]
00383 set n [expr {1.0/pow($n,16)}]
00384 #
00385 if {$x >= 0} {
00386 return [expr {1 - $n/2.0}]
00387 } else {
00388 return [expr {$n/2.0}]
00389 }
00390 }
00391
00392
00393
00394
00395
00396
00397
00398
00399
00400
00401
00402 ret ::math::statistics::test-normal (type data , type confidence) {
00403 set D [lillieforsFit $data]
00404
00405 set Dcrit --
00406 if { abs($confidence-0.80) < 0.0001 } {
00407 set Dcrit 0.741
00408 }
00409 if { abs($confidence-0.85) < 0.0001 } {
00410 set Dcrit 0.775
00411 }
00412 if { abs($confidence-0.90) < 0.0001 } {
00413 set Dcrit 0.819
00414 }
00415 if { abs($confidence-0.95) < 0.0001 } {
00416 set Dcrit 0.895
00417 }
00418 if { abs($confidence-0.99) < 0.0001 } {
00419 set Dcrit 1.035
00420 }
00421 if { $Dcrit != "--" } {
00422 return [expr {$D > $Dcrit ? 1 : 0 }]
00423 } else {
00424 return -code error "Confidence level must be one of: 0.80, 0.85, 0.90, 0.95 or 0.99"
00425 }
00426 }
00427
00428
00429
00430
00431
00432
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443 ret ::math::statistics::t-test-mean ( type data , type est_, type mean , type est_, type stdev , type confidence ) {
00444 variable NEGSTDEV
00445 variable TOOFEWDATA
00446
00447 if { $est_stdev <= 0.0 } {
00448 return -code error -errorcode ARG -errorinfo $NEGSTDEV $NEGSTDEV
00449 }
00450
00451 set allstats [BasicStats all $data]
00452
00453 set conf2 [expr {(1.0+$confidence)/2.0}]
00454
00455 set sample_mean [lindex $allstats 0]
00456 set sample_number [lindex $allstats 3]
00457
00458 if { $sample_number > 1 } {
00459 set tzero [expr {abs($sample_mean-$est_mean)/$est_stdev * \
00460 sqrt($sample_number-1)}]
00461 set degrees [expr {$sample_number-1}]
00462 set prob [cdf-students-t $degrees $tzero]
00463
00464 return [expr {$prob<$conf2}]
00465
00466 } else {
00467 return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA
00468 }
00469 }
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481
00482
00483
00484
00485 ret ::math::statistics::interval-mean-stdev ( type data , type confidence ) {
00486 variable TOOFEWDATA
00487 variable student_t_table
00488
00489 set allstats [BasicStats all $data]
00490
00491 set conf2 [expr {(1.0+$confidence)/2.0}]
00492 set mean [lindex $allstats 0]
00493 set number [lindex $allstats 3]
00494 set stdev [lindex $allstats 4]
00495
00496 if { $number > 1 } {
00497 set degrees [expr {$number-1}]
00498 set student_t \
00499 [::math::interpolation::interpolate2d $student_t_table \
00500 $degrees $conf2]
00501 set mean_lower [expr {$mean-$student_t*$stdev/sqrt($number)}]
00502 set mean_upper [expr {$mean+$student_t*$stdev/sqrt($number)}]
00503 set stdev_lower {}
00504 set stdev_upper {}
00505 return [list $mean_lower $mean_upper $stdev_lower $stdev_upper]
00506 } else {
00507 return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA
00508 }
00509 }
00510
00511
00512
00513
00514
00515
00516
00517
00518
00519
00520
00521
00522
00523
00524
00525
00526 ret ::math::statistics::quantiles ( type arg1 , type arg2 , optional arg3 ={) } {
00527 variable TOOFEWDATA
00528
00529 if { [catch {
00530 if { $arg3 == {} } {
00531 set result \
00532 [::math::statistics::QuantilesRawData $arg1 $arg2]
00533 } else {
00534 set result \
00535 [::math::statistics::QuantilesHistogram $arg1 $arg2 $arg3]
00536 }
00537 } msg] } {
00538 return -code error -errorcode $msg $msg
00539 }
00540 return $result
00541 }
00542
00543
00544
00545
00546
00547
00548
00549
00550
00551
00552
00553 ret ::math::statistics::QuantilesRawData ( type data , type confidence ) {
00554 variable TOOFEWDATA
00555 variable OUTOFRANGE
00556
00557 if { [llength $confidence] <= 0 } {
00558 return -code error -errorcode ARG "$TOOFEWDATA - quantiles"
00559 }
00560
00561 if { [llength $data] <= 0 } {
00562 return -code error -errorcode ARG "$TOOFEWDATA - raw data"
00563 }
00564
00565 foreach cond $confidence {
00566 if { $cond <= 0.0 || $cond >= 1.0 } {
00567 return -code error -errorcode ARG "$OUTOFRANGE - quantiles"
00568 }
00569 }
00570
00571 #
00572 # Sort the data first
00573 #
00574 set sorted_data [lsort -real -increasing $data]
00575
00576 #
00577 # Determine the list element lower or equal to the quantile
00578 # and return the corresponding value
00579 #
00580 set result {}
00581 set number_data [llength $sorted_data]
00582 foreach cond $confidence {
00583 set elem [expr {round($number_data*$cond)-1}]
00584 if { $elem < 0 } {
00585 set elem 0
00586 }
00587 lappend result [lindex $sorted_data $elem]
00588 }
00589
00590 return $result
00591 }
00592
00593
00594
00595
00596
00597
00598
00599
00600
00601
00602
00603
00604 ret ::math::statistics::QuantilesHistogram ( type limits , type counts , type confidence ) {
00605 variable TOOFEWDATA
00606 variable OUTOFRANGE
00607
00608 if { [llength $confidence] <= 0 } {
00609 return -code error -errorcode ARG "$TOOFEWDATA - quantiles"
00610 }
00611
00612 if { [llength $confidence] <= 0 } {
00613 return -code error -errorcode ARG "$TOOFEWDATA - histogram limits"
00614 }
00615
00616 if { [llength $counts] <= [llength $limits] } {
00617 return -code error -errorcode ARG "$TOOFEWDATA - histogram counts"
00618 }
00619
00620 foreach cond $confidence {
00621 if { $cond <= 0.0 || $cond >= 1.0 } {
00622 return -code error -errorcode ARG "$OUTOFRANGE - quantiles"
00623 }
00624 }
00625
00626 #
00627 # Accumulate the histogram counts first
00628 #
00629 set sum 0
00630 set accumulated_counts {}
00631 foreach count $counts {
00632 set sum [expr {$sum+$count}]
00633 lappend accumulated_counts $sum
00634 }
00635 set total_counts $sum
00636
00637 #
00638 # Determine the list element lower or equal to the quantile
00639 # and return the corresponding value (use interpolation if
00640 # possible)
00641 #
00642 set result {}
00643 foreach cond $confidence {
00644 set found 0
00645 set bound [expr {round($total_counts*$cond)}]
00646 set lower_limit {}
00647 set lower_count 0
00648 foreach acc_count $accumulated_counts limit $limits {
00649 if { $acc_count >= $bound } {
00650 set found 1
00651 break
00652 }
00653 set lower_limit $limit
00654 set lower_count $acc_count
00655 }
00656
00657 if { $lower_limit == {} || $limit == {} || $found == 0 } {
00658 set quant $limit
00659 if { $limit == {} } {
00660 set quant $lower_limit
00661 }
00662 } else {
00663 set quant [expr {$limit+($lower_limit-$limit) *
00664 ($acc_count-$bound)/($acc_count-$lower_count)}]
00665 }
00666 lappend result $quant
00667 }
00668
00669 return $result
00670 }
00671
00672
00673
00674
00675
00676
00677
00678
00679
00680
00681
00682 ret ::math::statistics::autocorr ( type data ) {
00683 variable TOOFEWDATA
00684
00685 if { [llength $data] <= 1 } {
00686 return -code error -errorcode ARG "$TOOFEWDATA"
00687 }
00688
00689 return [crosscorr $data $data]
00690 }
00691
00692
00693
00694
00695
00696
00697
00698
00699
00700
00701
00702
00703
00704
00705
00706
00707 ret ::math::statistics::crosscorr ( type data1 , type data2 ) {
00708 variable TOOFEWDATA
00709
00710 if { [llength $data1] <= 1 || [llength $data2] <= 1 } {
00711 return -code error -errorcode ARG "$TOOFEWDATA"
00712 }
00713
00714 #
00715 # First determine the number of data pairs
00716 #
00717 set number1 [llength $data1]
00718 set number2 [llength $data2]
00719
00720 set basic_stat1 [basic-stats $data1]
00721 set basic_stat2 [basic-stats $data2]
00722 set vmean1 [lindex $basic_stat1 0]
00723 set vmean2 [lindex $basic_stat2 0]
00724 set vvar1 [lindex $basic_stat1 end]
00725 set vvar2 [lindex $basic_stat2 end]
00726
00727 set number_pairs $number1
00728 if { $number1 > $number2 } {
00729 set number_pairs $number2
00730 }
00731 set number_values $number_pairs
00732 set number_delays [expr {$number_values/2.0}]
00733
00734 set scale [expr {sqrt($vvar1*$vvar2)}]
00735
00736 set result {}
00737 for { set delay 0 } { $delay < $number_delays } { incr delay } {
00738 set sumcross 0.0
00739 set no_cross 0
00740 for { set idx 0 } { $idx < $number_values } { incr idx } {
00741 set value1 [lindex $data1 $idx]
00742 set value2 [lindex $data2 [expr {$idx+$delay}]]
00743 if { $value1 != {} && $value2 != {} } {
00744 set sumcross \
00745 [expr {$sumcross+($value1-$vmean1)*($value2-$vmean2)}]
00746 incr no_cross
00747 }
00748 }
00749 lappend result [expr {$sumcross/($no_cross*$scale)}]
00750
00751 incr number_values -1
00752 }
00753
00754 return $result
00755 }
00756
00757
00758
00759
00760
00761
00762
00763
00764
00765
00766
00767
00768
00769 ret ::math::statistics::mean-histogram-limits ( type mean , type stdev , optional number =8 ) {
00770 variable NEGSTDEV
00771
00772 if { $stdev <= 0.0 } {
00773 return -code error -errorcode ARG "$NEGSTDEV"
00774 }
00775 if { $number < 1 } {
00776 return -code error -errorcode ARG "Number of limits must be positive"
00777 }
00778
00779 #
00780 # Always: between mean-3.0*stdev and mean+3.0*stdev
00781 # number = 2: -0.25, 0.25
00782 # number = 3: -0.25, 0, 0.25
00783 # number = 4: -1, -0.25, 0.25, 1
00784 # number = 5: -1, -0.25, 0, 0.25, 1
00785 # number = 6: -2, -1, -0.25, 0.25, 1, 2
00786 # number = 7: -2, -1, -0.25, 0, 0.25, 1, 2
00787 # number = 8: -3, -2, -1, -0.25, 0.25, 1, 2, 3
00788 #
00789 switch -- $number {
00790 "1" { set limits {0.0} }
00791 "2" { set limits {-0.25 0.25} }
00792 "3" { set limits {-0.25 0.0 0.25} }
00793 "4" { set limits {-1.0 -0.25 0.25 1.0} }
00794 "5" { set limits {-1.0 -0.25 0.0 0.25 1.0} }
00795 "6" { set limits {-2.0 -1.0 -0.25 0.25 1.0 2.0} }
00796 "7" { set limits {-2.0 -1.0 -0.25 0.0 0.25 1.0 2.0} }
00797 "8" { set limits {-3.0 -2.0 -1.0 -0.25 0.25 1.0 2.0 3.0} }
00798 "9" { set limits {-3.0 -2.0 -1.0 -0.25 0.0 0.25 1.0 2.0 3.0} }
00799 default {
00800 set dlim [expr {6.0/double($number-1)}]
00801 for {set i 0} {$i <$number} {incr i} {
00802 lappend limits [expr {$dlim*($i-($number-1)/2.0)}]
00803 }
00804 }
00805 }
00806
00807 set result {}
00808 foreach limit $limits {
00809 lappend result [expr {$mean+$limit*$stdev}]
00810 }
00811
00812 return $result
00813 }
00814
00815
00816
00817
00818
00819
00820
00821
00822
00823
00824
00825
00826
00827 ret ::math::statistics::minmax-histogram-limits ( type min , type max , optional number =8 ) {
00828 variable NEGSTDEV
00829
00830 if { $number < 1 } {
00831 return -code error -errorcode ARG "Number of limits must be positive"
00832 }
00833 if { $min >= $max } {
00834 return -code error -errorcode ARG "Minimum must be lower than maximum"
00835 }
00836
00837 set result {}
00838 set dlim [expr {($max-$min)/double($number-1)}]
00839 for {set i 0} {$i <$number} {incr i} {
00840 lappend result [expr {$min+$dlim*$i}]
00841 }
00842
00843 return $result
00844 }
00845
00846
00847
00848
00849
00850
00851
00852
00853
00854
00855
00856
00857
00858
00859
00860
00861
00862
00863
00864
00865
00866
00867
00868 ret ::math::statistics::linear-model ( type xdata , type ydata , optional intercept =1 ) {
00869 variable TOOFEWDATA
00870
00871 if { [llength $xdata] < 3 } {
00872 return -code error -errorcode ARG "$TOOFEWDATA: not enough independent data"
00873 }
00874 if { [llength $ydata] < 3 } {
00875 return -code error -errorcode ARG "$TOOFEWDATA: not enough dependent data"
00876 }
00877 if { [llength $xdata] != [llength $ydata] } {
00878 return -code error -errorcode ARG "$TOOFEWDATA: number of dependent data differs from number of independent data"
00879 }
00880
00881 set sumx 0.0
00882 set sumy 0.0
00883 set sumx2 0.0
00884 set sumy2 0.0
00885 set sumxy 0.0
00886 set df 0
00887 foreach x $xdata y $ydata {
00888 if { $x != "" && $y != "" } {
00889 set sumx [expr {$sumx+$x}]
00890 set sumy [expr {$sumy+$y}]
00891 set sumx2 [expr {$sumx2+$x*$x}]
00892 set sumy2 [expr {$sumy2+$y*$y}]
00893 set sumxy [expr {$sumxy+$x*$y}]
00894 incr df
00895 }
00896 }
00897
00898 if { $df <= 2 } {
00899 return -code error -errorcode ARG "$TOOFEWDATA: too few valid data"
00900 }
00901 if { $sumx2 == 0.0 } {
00902 return -code error -errorcode ARG "$TOOFEWDATA: independent values are all the same"
00903 }
00904
00905 #
00906 # Calculate the intermediate quantities
00907 #
00908 set sx [expr {$sumx2-$sumx*$sumx/$df}]
00909 set sy [expr {$sumy2-$sumy*$sumy/$df}]
00910 set sxy [expr {$sumxy-$sumx*$sumy/$df}]
00911
00912 #
00913 # Calculate the coefficients
00914 #
00915 if { $intercept } {
00916 set B [expr {$sxy/$sx}]
00917 set A [expr {($sumy-$B*$sumx)/$df}]
00918 } else {
00919 set B [expr {$sumxy/$sumx2}]
00920 set A 0.0
00921 }
00922
00923 #
00924 # Calculate the error estimates
00925 #
00926 set stdevY 0.0
00927 set varY 0.0
00928
00929 if { $intercept } {
00930 set ve [expr {$sy-$B*$sxy}]
00931 if { $ve >= 0.0 } {
00932 set varY [expr {$ve/($df-2)}]
00933 }
00934 } else {
00935 set ve [expr {$sumy2-$B*$sumxy}]
00936 if { $ve >= 0.0 } {
00937 set varY [expr {$ve/($df-1)}]
00938 }
00939 }
00940 set seY [expr {sqrt($varY)}]
00941
00942 if { $intercept } {
00943 set R2 [expr {$sxy*$sxy/($sx*$sy)}]
00944 set seA [expr {$seY*sqrt(1.0/$df+$sumx*$sumx/($sx*$df*$df))}]
00945 set seB [expr {sqrt($varY/$sx)}]
00946 set tA {}
00947 set tB {}
00948 if { $seA != 0.0 } {
00949 set tA [expr {$A/$seA*sqrt($df-2)}]
00950 }
00951 if { $seB != 0.0 } {
00952 set tB [expr {$B/$seB*sqrt($df-2)}]
00953 }
00954 } else {
00955 set R2 [expr {$sumxy*$sumxy/($sumx2*$sumy2)}]
00956 set seA {}
00957 set tA {}
00958 set tB {}
00959 set seB [expr {sqrt($varY/$sumx2)}]
00960 if { $seB != 0.0 } {
00961 set tB [expr {$B/$seB*sqrt($df-1)}]
00962 }
00963 }
00964
00965 #
00966 # Return the list of parameters
00967 #
00968 return [list $A $B $seY $R2 $df $seA $tA $seB $tB]
00969 }
00970
00971
00972
00973
00974
00975
00976
00977
00978
00979
00980
00981
00982
00983 ret ::math::statistics::linear-residuals ( type xdata , type ydata , optional intercept =1 ) {
00984 variable TOOFEWDATA
00985
00986 if { [llength $xdata] < 3 } {
00987 return -code error -errorcode ARG "$TOOFEWDATA: no independent data"
00988 }
00989 if { [llength $ydata] < 3 } {
00990 return -code error -errorcode ARG "$TOOFEWDATA: no dependent data"
00991 }
00992 if { [llength $xdata] != [llength $ydata] } {
00993 return -code error -errorcode ARG "$TOOFEWDATA: number of dependent data differs from number of independent data"
00994 }
00995
00996 foreach {A B} [linear-model $xdata $ydata $intercept] {break}
00997
00998 set result {}
00999 foreach x $xdata y $ydata {
01000 set residue [expr {$y-$A-$B*$x}]
01001 lappend result $residue
01002 }
01003 return $result
01004 }
01005
01006
01007
01008
01009
01010
01011
01012
01013
01014
01015
01016
01017
01018
01019 ret ::math::statistics::median ( type data ) {
01020 set org_data $data
01021 set data {}
01022 foreach value $org_data {
01023 if { $value != {} } {
01024 lappend data $value
01025 }
01026 }
01027 set len [llength $data]
01028
01029 set data [lsort -real $data]
01030 if { $len % 2 } {
01031 lindex $data [expr {($len-1)/2}]
01032 } else {
01033 expr {([lindex $data [expr {($len / 2) - 1}]] \
01034 + [lindex $data [expr {$len / 2}]]) / 2.0}
01035 }
01036 }
01037
01038
01039
01040
01041
01042
01043
01044
01045
01046
01047
01048
01049
01050
01051
01052
01053 ret ::math::statistics::test-2x2 ( type a , type b , type c , type d ) {
01054 set ab [expr {$a+$b}]
01055 set ac [expr {$a+$c}]
01056 set bd [expr {$b+$d}]
01057 set cd [expr {$c+$d}]
01058 set N [expr {$a+$b+$c+$d}]
01059 set det [expr {$a*$d-$b*$c}]
01060 set result [expr {double($N*$det*$det)/double($ab*$cd*$ac*$bd)}]
01061 }
01062
01063
01064
01065
01066
01067
01068
01069
01070
01071
01072
01073
01074 ret ::math::statistics::print-2x2 ( type a , type b , type c , type d ) {
01075 set ab [expr {$a+$b}]
01076 set ac [expr {$a+$c}]
01077 set bd [expr {$b+$d}]
01078 set cd [expr {$c+$d}]
01079 set N [expr {$a+$b+$c+$d}]
01080 set chisq [test-2x2 $a $b $c $d]
01081
01082 set line [string repeat - 10]
01083 set result [format "%10d%10d | %10d\n" $a $b $ab]
01084 append result [format "%10d%10d | %10d\n" $c $d $cd]
01085 append result [format "%10s%10s + %10s\n" $line $line $line]
01086 append result [format "%10d%10d | %10d\n" $ac $bd $N]
01087 append result "Chisquare = $chisq\n"
01088 append result "Difference is significant?\n"
01089 append result " at 95%: [expr {$chisq<3.84146? "no":"yes"}]\n"
01090 append result " at 99%: [expr {$chisq<6.63490? "no":"yes"}]"
01091 }
01092
01093
01094
01095
01096
01097
01098
01099
01100
01101
01102
01103 ret ::math::statistics::control-xbar ( type data , optional nsamples =4 ) {
01104 variable TOOFEWDATA
01105 variable control_factors
01106
01107 #
01108 # Check the number of data
01109 #
01110 if { $nsamples <= 1 } {
01111 return -code error -errorcode DATA -errorinfo $OUTOFRANGE \
01112 "Number of data per subsample must be at least 2"
01113 }
01114 if { [llength $data] < 20*$nsamples } {
01115 return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA
01116 }
01117
01118 set nogroups [expr {[llength $data]/$nsamples}]
01119 set mrange 0.0
01120 set xmeans 0.0
01121 for { set i 0 } { $i < $nogroups } { incr i } {
01122 set subsample [lrange $data [expr {$i*$nsamples}] [expr {$i*$nsamples+$nsamples-1}]]
01123
01124 set xmean 0.0
01125 set xmin [lindex $subsample 0]
01126 set xmax $xmin
01127 foreach d $subsample {
01128 set xmean [expr {$xmean+$d}]
01129 set xmin [expr {$xmin<$d? $xmin : $d}]
01130 set xmax [expr {$xmax>$d? $xmax : $d}]
01131 }
01132 set xmean [expr {$xmean/double($nsamples)}]
01133
01134 set xmeans [expr {$xmeans+$xmean}]
01135 set mrange [expr {$mrange+($xmax-$xmin)}]
01136 }
01137
01138 #
01139 # Determine the control lines
01140 #
01141 set xmeans [expr {$xmeans/double($nogroups)}]
01142 set mrange [expr {$mrange/double($nogroups)}]
01143 set A2 [lindex [lindex $control_factors 1] $nsamples]
01144 if { $A2 == "" } { set A2 [lindex [lindex $control_factors 1] end] }
01145
01146 return [list $xmeans [expr {$xmeans-$A2*$mrange}] \
01147 [expr {$xmeans+$A2*$mrange}] $nsamples]
01148 }
01149
01150
01151
01152
01153
01154
01155
01156
01157
01158
01159 ret ::math::statistics::test-xbar ( type control , type data ) {
01160 foreach {xmean xlower xupper nsamples} $control {break}
01161
01162 if { [llength $data] < 1 } {
01163 return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA
01164 }
01165
01166 set nogroups [expr {[llength $data]/$nsamples}]
01167 if { $nogroups <= 0 } {
01168 set nogroup 1
01169 set nsamples [llength $data]
01170 }
01171
01172 set result {}
01173
01174 for { set i 0 } { $i < $nogroups } { incr i } {
01175 set subsample [lrange $data [expr {$i*$nsamples}] [expr {$i*$nsamples+$nsamples-1}]]
01176
01177 set xmean 0.0
01178 foreach d $subsample {
01179 set xmean [expr {$xmean+$d}]
01180 }
01181 set xmean [expr {$xmean/double($nsamples)}]
01182
01183 if { $xmean < $xlower } { lappend result $i }
01184 if { $xmean > $xupper } { lappend result $i }
01185 }
01186
01187 return $result
01188 }
01189
01190
01191
01192
01193
01194
01195
01196
01197
01198
01199
01200 ret ::math::statistics::control-Rchart ( type data , optional nsamples =4 ) {
01201 variable TOOFEWDATA
01202 variable control_factors
01203
01204 #
01205 # Check the number of data
01206 #
01207 if { $nsamples <= 1 } {
01208 return -code error -errorcode DATA -errorinfo $OUTOFRANGE \
01209 "Number of data per subsample must be at least 2"
01210 }
01211 if { [llength $data] < 20*$nsamples } {
01212 return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA
01213 }
01214
01215 set nogroups [expr {[llength $data]/$nsamples}]
01216 set mrange 0.0
01217 for { set i 0 } { $i < $nogroups } { incr i } {
01218 set subsample [lrange $data [expr {$i*$nsamples}] [expr {$i*$nsamples+$nsamples-1}]]
01219
01220 set xmin [lindex $subsample 0]
01221 set xmax $xmin
01222 foreach d $subsample {
01223 set xmin [expr {$xmin<$d? $xmin : $d}]
01224 set xmax [expr {$xmax>$d? $xmax : $d}]
01225 }
01226 set mrange [expr {$mrange+($xmax-$xmin)}]
01227 }
01228
01229 #
01230 # Determine the control lines
01231 #
01232 set mrange [expr {$mrange/double($nogroups)}]
01233 set D3 [lindex [lindex $control_factors 3] $nsamples]
01234 set D4 [lindex [lindex $control_factors 5] $nsamples]
01235 if { $D3 == "" } { set D3 [lindex [lindex $control_factors 3] end] }
01236 if { $D4 == "" } { set D4 [lindex [lindex $control_factors 5] end] }
01237
01238 return [list $mrange [expr {$D3*$mrange}] \
01239 [expr {$D4*$mrange}] $nsamples]
01240 }
01241
01242
01243
01244
01245
01246
01247
01248
01249
01250
01251 ret ::math::statistics::test-Rchart ( type control , type data ) {
01252 foreach {rmean rlower rupper nsamples} $control {break}
01253
01254 #
01255 # Check the number of data
01256 #
01257 if { [llength $data] < 1 } {
01258 return -code error -errorcode DATA -errorinfo $TOOFEWDATA $TOOFEWDATA
01259 }
01260
01261 set nogroups [expr {[llength $data]/$nsamples}]
01262
01263 set result {}
01264 for { set i 0 } { $i < $nogroups } { incr i } {
01265 set subsample [lrange $data [expr {$i*$nsamples}] [expr {$i*$nsamples+$nsamples-1}]]
01266
01267 set xmin [lindex $subsample 0]
01268 set xmax $xmin
01269 foreach d $subsample {
01270 set xmin [expr {$xmin<$d? $xmin : $d}]
01271 set xmax [expr {$xmax>$d? $xmax : $d}]
01272 }
01273 set range [expr {$xmax-$xmin}]
01274
01275 if { $range < $rlower } { lappend result $i }
01276 if { $range > $rupper } { lappend result $i }
01277 }
01278
01279 return $result
01280 }
01281
01282
01283
01284
01285
01286
01287 source [file join [file dirname [info script]] pdf_stat.tcl]
01288 source [file join [file dirname [info script]] plotstat.tcl]
01289 source [file join [file dirname [info script]] liststat.tcl]
01290 source [file join [file dirname [info script]] mvlinreg.tcl]
01291
01292
01293
01294
01295 namespace ::math::statistics {
01296 variable student_t_table
01297
01298
01299
01300
01301
01302
01303
01304
01305
01306
01307
01308
01309
01310
01311
01312
01313
01314
01315 }
01316
01317
01318
01319
01320 if { [info exists ::argv0] && ([file tail [info script]] == [file tail $::argv0]) } {
01321
01322 console show
01323 puts [interp aliases]
01324
01325 values = {1 1 1 1 {}}
01326 puts [::math::statistics::basic-stats $values]
01327 values = {1 2 3 4}
01328 puts [::math::statistics::basic-stats $values]
01329 values = {1 -1 1 -2}
01330 puts [::math::statistics::basic-stats $values]
01331 puts [::math::statistics::mean $values]
01332 puts [::math::statistics::min $values]
01333 puts [::math::statistics::max $values]
01334 puts [::math::statistics::number $values]
01335 puts [::math::statistics::stdev $values]
01336 puts [::math::statistics::var $values]
01337
01338 novals = 100
01339
01340 maxvals = 1001
01341 while { $novals < $maxvals } {
01342 values = {}
01343 for { i = 0 } { $i < $novals } { incr i } {
01344 lappend values [expr {rand()}]
01345 }
01346 puts [::math::statistics::basic-stats $values]
01347 puts [::math::statistics::histogram {0.0 0.2 0.4 0.6 0.8 1.0} $values]
01348 novals = [expr {$novals*10}]
01349 }
01350
01351 puts "Normal distribution:"
01352 puts "X=0: [::math::statistics::pdf-normal 0.0 1.0 0.0]"
01353 puts "X=1: [::math::statistics::pdf-normal 0.0 1.0 1.0]"
01354 puts "X=-1: [::math::statistics::pdf-normal 0.0 1.0 -1.0]"
01355
01356 data1 = {0.0 1.0 3.0 4.0 100.0 -23.0}
01357 data2 = {1.0 2.0 4.0 5.0 101.0 -22.0}
01358 data3 = {0.0 2.0 6.0 8.0 200.0 -46.0}
01359 data4 = {2.0 6.0 8.0 200.0 -46.0 1.0}
01360 data5 = {100.0 99.0 90.0 93.0 5.0 123.0}
01361 puts "Correlation data1 and data1: [::math::statistics::corr $data1 $data1]"
01362 puts "Correlation data1 and data2: [::math::statistics::corr $data1 $data2]"
01363 puts "Correlation data1 and data3: [::math::statistics::corr $data1 $data3]"
01364 puts "Correlation data1 and data4: [::math::statistics::corr $data1 $data4]"
01365 puts "Correlation data1 and data5: [::math::statistics::corr $data1 $data5]"
01366
01367
01368
01369
01370
01371
01372
01373
01374
01375
01376
01377
01378
01379 rc = [catch {
01380 m = [::math::statistics::mean {}]
01381 } msg ] ;
01382 puts "Result: $rc $msg"
01383
01384 puts "\nTest quantiles:"
01385 data = {1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 10.0}
01386 quantiles = {0.11 0.21 0.51 0.91 0.99}
01387 limits = {2.1 4.1 6.1 8.1}
01388 puts [::math::statistics::quantiles $data $quantiles]
01389
01390 histogram = [::math::statistics::histogram $limits $data]
01391 puts [::math::statistics::quantiles $limits $histogram $quantiles]
01392
01393 puts "\nTest autocorrelation:"
01394 data = {1.0 -1.0 1.0 -1.0 1.0 -1.0 1.0 -1.0 1.0}
01395 puts [::math::statistics::autocorr $data]
01396 data = {1.0 -1.1 2.0 -0.6 3.0 -4.0 0.5 0.9 -1.0}
01397 puts [::math::statistics::autocorr $data]
01398
01399 puts "\nTest histogram limits:"
01400 puts [::math::statistics::mean-histogram-limits 1.0 1.0]
01401 puts [::math::statistics::mean-histogram-limits 1.0 1.0 4]
01402 puts [::math::statistics::minmax-histogram-limits 1.0 10.0 10]
01403
01404 }
01405
01406
01407
01408
01409 if { 0 } {
01410 data = {}
01411 for { i = 0 } { $i < 500 } { incr i } {
01412 lappend data [expr {rand()}]
01413 }
01414 limits = [::math::statistics::control-xbar $data]
01415 puts $limits
01416
01417 puts "Outliers? [::math::statistics::test-xbar $limits $data]"
01418
01419 newdata = {1.0 1.0 1.0 1.0 0.5 0.5 0.5 0.5 10.0 10.0 10.0 10.0}
01420 puts "Outliers? [::math::statistics::test-xbar $limits $newdata] -- 0 2"
01421
01422 limits = [::math::statistics::control-Rchart $data]
01423 puts $limits
01424
01425 puts "Outliers? [::math::statistics::test-Rchart $limits $data]"
01426
01427 newdata = {0.0 1.0 2.0 1.0 0.4 0.5 0.6 0.5 10.0 0.0 10.0 10.0}
01428 puts "Outliers? [::math::statistics::test-Rchart $limits $newdata] -- 0 2"
01429 }
01430
01431