mvlinreg.tcl

Go to the documentation of this file.
00001 /*  mvreglin.tcl --*/
00002 /*      Addition to the statistics package*/
00003 /*      Copyright 2007 Eric Kemp-Benedict*/
00004 /*      Released under the BSD license under any terms*/
00005 /*      that allow it to be compatible with tcllib*/
00006 
00007 package require math::linearalgebra 1.0
00008 package require math::statistics 0.4
00009 
00010 /*  ::math::statistics --*/
00011 /*      This file adds:*/
00012 /*      mvlinreg = Multivariate Linear Regression*/
00013 /* */
00014 namespace ::math::statistics {
00015     variable epsilon 1.0e-7
00016 
00017     namespace export tstat mv-wls mv-ols
00018 
00019     namespace import -force \
00020         ::math::linearalgebra::mkMatrix \
00021         ::math::linearalgebra::mkVector \
00022         ::math::linearalgebra::mkIdentity \
00023         ::math::linearalgebra::mkDiagonal \
00024         ::math::linearalgebra::getrow \
00025         ::math::linearalgebra::row =  \
00026         ::math::linearalgebra::getcol \
00027         ::math::linearalgebra::col =  \
00028         ::math::linearalgebra::getelem \
00029         ::math::linearalgebra::elem =  \
00030         ::math::linearalgebra::dotproduct \
00031         ::math::linearalgebra::matmul \
00032         ::math::linearalgebra::add \
00033         ::math::linearalgebra::sub \
00034         ::math::linearalgebra::solveGauss \
00035         ::math::linearalgebra::transpose
00036 }
00037 
00038 /*  tstats --*/
00039 /*      Returns inverse of the single-tailed t distribution*/
00040 /*      given number of degrees of freedom & confidence*/
00041 /* */
00042 /*  Arguments:*/
00043 /*      n        Number of degrees of freedom*/
00044 /*      alpha    Confidence level (defaults to 0.05)*/
00045 /* */
00046 /*  Result:*/
00047 /*      Inverse of the t-distribution*/
00048 /* */
00049 /*  Note:*/
00050 /*      Iterates until result is within epsilon of the target.*/
00051 /*      It is possible that the iteration does not converge*/
00052 /* */
00053 ret  ::math::statistics::tstat (type n , optional alpha =0.05) {
00054     variable epsilon
00055     variable tvals
00056 
00057     if [info exists tvals($n:$alpha)] {
00058         return $tvals($n:$alpha)
00059     }
00060 
00061     set deltat [expr {100 * $epsilon}]
00062     # For one-tailed distribution,
00063     set ptarg [expr {1.000 - $alpha/2.0}]
00064     set maxiter 100
00065 
00066     # Initial value for t
00067     set t 2.0
00068 
00069     set niter 0
00070     while {abs([::math::statistics::cdf-students-t $n $t] - $ptarg) > $epsilon} {
00071         set pstar [::math::statistics::cdf-students-t $n $t]
00072         set pl [::math::statistics::cdf-students-t $n [expr {$t - $deltat}]]
00073         set ph [::math::statistics::cdf-students-t $n [expr {$t + $deltat}]]
00074 
00075         set t [expr {$t + 2.0 * $deltat * ($ptarg - $pstar)/($ph - $pl)}]
00076 
00077         incr niter
00078         if {$niter == $maxiter} {
00079             return -code error "::math::statistics::tstat: Did not converge after $niter iterations"
00080         }
00081     }
00082 
00083     # Cache the result to shorten the call in future
00084     set tvals($n:$alpha) $t
00085 
00086     return $t
00087 }
00088 
00089 /*  mv-wls --*/
00090 /*      Weighted Least Squares*/
00091 /* */
00092 /*  Arguments:*/
00093 /*      data       Alternating list of weights and observations*/
00094 /* */
00095 /*  Result:*/
00096 /*      List containing:*/
00097 /*      * R-squared*/
00098 /*      * Adjusted R-squared*/
00099 /*      * Coefficients of x's in fit*/
00100 /*      * Standard errors of the coefficients*/
00101 /*      * 95% confidence bounds for coefficients*/
00102 /* */
00103 /*  Note:*/
00104 /*      The observations are lists starting with the dependent variable y*/
00105 /*      and then the values of the independent variables (x1, x2, ...):*/
00106 /* */
00107 /*      mv-wls [list w [list y x's] w [list y x's] ...]*/
00108 /* */
00109 ret  ::math::statistics::mv-wls (type data) {
00110 
00111     # Fill the matrices of x & y values, and weights
00112     # For n points, k coefficients
00113 
00114     # The number of points is equal to half the arguments (n weights, n points)
00115     set n [expr {[llength $data]/2}]
00116 
00117     set firstloop true
00118     # Sum up all y values to take an average
00119     set ysum 0
00120     # Add up the weights
00121     set wtsum 0
00122     # Count over rows (points) as you go
00123     set point 0
00124     foreach {wt pt} $data {
00125 
00126         # Check inputs
00127         if {[string is double $wt] == 0} {
00128             return -code error "::math::statistics::mv-wls: Weight \"$wt\" is not a number"
00129             return {}
00130         }
00131 
00132         ## -- Check dimensions, initialize
00133         if $firstloop {
00134             # k = num of vals in pt = 1 + number of x's (because of constant)
00135             set k [llength $pt]
00136             if {$n <= [expr {$k + 1}]} {
00137                 return -code error "::math::statistics::mv-wls: Too few degrees of freedom: $k variables but only $n points"
00138                 return {}
00139             }
00140             set X [mkMatrix $n $k]
00141             set y [mkVector $n]
00142             set I_x [mkIdentity $k]
00143             set I_y [mkIdentity $n]
00144 
00145             set firstloop false
00146 
00147         } else {
00148             # Have to have same number of x's for all points
00149             if {$k != [llength $pt]} {
00150                 return -code error "::math::statistics::mv-wls: Point \"$pt\" has wrong number of values (expected $k)"
00151                 # Clean up
00152                 return {}
00153             }
00154         }
00155 
00156         ## -- Extract values from set of points
00157         # Make a list of y values
00158         set yval [expr {double([lindex $pt 0])}]
00159         setelem y $point $yval
00160         set ysum [expr {$ysum + $wt * $yval}]
00161         set wtsum [expr {$wtsum + $wt}]
00162         # Add x-values to the x-matrix
00163         set xrow [lrange $pt 1 end]
00164         # Add the constant (value = 1.0)
00165         lappend xrow 1.0
00166         setrow X $point $xrow
00167         # Create list of weights & square root of weights
00168         lappend w [expr {double($wt)}]
00169         lappend sqrtw [expr {sqrt(double($wt))}]
00170 
00171         incr point
00172 
00173     }
00174 
00175     set ymean [expr {double($ysum)/$wtsum}]
00176     set W [mkDiagonal $w]
00177     set sqrtW [mkDiagonal $sqrtw]
00178 
00179     # Calculate sum os square differences for x's
00180     for {set r 0} {$r < $k} {incr r} {
00181         set xsqrsum 0.0
00182         set xmeansum 0.0
00183         # Calculate sum of squared differences as: sum(x^2) - (sum x)^2/n
00184         for {set t 0} {$t < $n} {incr t} {
00185             set xval [getelem $X $t $r]
00186             set xmeansum [expr {$xmeansum + double($xval)}]
00187             set xsqrsum [expr {$xsqrsum + double($xval * $xval)}]
00188         }
00189         lappend xsqr [expr {$xsqrsum - $xmeansum * $xmeansum/$n}]
00190     }
00191 
00192     ## -- Set up the X'WX matrix
00193     set XtW [matmul [::math::linearalgebra::transpose $X] $W]
00194     set XtWX [matmul $XtW $X]
00195 
00196     # Invert
00197     set M [solveGauss $XtWX $I_x]
00198 
00199     set beta [matmul $M [matmul $XtW $y]]
00200 
00201     ### -- Residuals & R-squared
00202     # 1) Generate list of diagonals of the hat matrix
00203     set H [matmul $X [matmul $M $XtW]]
00204     for {set i 0} {$i < $n} {incr i} {
00205         lappend h_ii [getelem $H $i $i]
00206     }
00207 
00208     set R [matmul $sqrtW [matmul [sub $I_y $H] $y]]
00209     set yhat [matmul $H $y]
00210 
00211     # 2) Generate list of residuals, sum of squared residuals, r-squared
00212     set sstot 0.0
00213     set ssreg 0.0
00214     # Note: Relying on representation of Vector as a list for y, yhat
00215     foreach yval $y wt $w yhatval $yhat {
00216         set sstot [expr {$sstot + $wt * ($yval - $ymean) * ($yval - $ymean)}]
00217         set ssreg [expr {$ssreg + $wt * ($yhatval - $ymean) * ($yhatval - $ymean)}]
00218     }
00219     set r2 [expr {double($ssreg)/$sstot}]
00220     set adjr2 [expr {1.0 - (1.0 - $r2) * ($n - 1)/($n - $k)}]
00221     set sumsqresid [dotproduct $R $R]
00222     set s2 [expr {double($sumsqresid) / double($n - $k)}]
00223 
00224     ### -- Confidence intervals for coefficients
00225     set tvalue [tstat [expr {$n - $k}]]
00226     for {set i 0} {$i < $k} {incr i} {
00227         set stderr [expr {sqrt($s2 * [getelem $M $i $i])}]
00228         set mid [lindex $beta $i]
00229         lappend stderrs $stderr
00230         lappend confinterval [list [expr {$mid - $tvalue * $stderr}] [expr {$mid + $tvalue * $stderr}]]
00231     }
00232 
00233     return [list $r2 $adjr2 $beta $stderrs $confinterval]
00234 }
00235 
00236 /*  mv-ols --*/
00237 /*      Ordinary Least Squares*/
00238 /* */
00239 /*  Arguments:*/
00240 /*      data       List of observations, list of lists*/
00241 /* */
00242 /*  Result:*/
00243 /*      List containing:*/
00244 /*      * R-squared*/
00245 /*      * Adjusted R-squared*/
00246 /*      * Coefficients of x's in fit*/
00247 /*      * Standard errors of the coefficients*/
00248 /*      * 95% confidence bounds for coefficients*/
00249 /* */
00250 /*  Note:*/
00251 /*      The observations are lists starting with the dependent variable y*/
00252 /*      and then the values of the independent variables (x1, x2, ...):*/
00253 /* */
00254 /*      mv-ols [list y x's] [list y x's] ...*/
00255 /* */
00256 ret  ::math::statistics::mv-ols (type data) {
00257     set newdata {}
00258     foreach pt $data {
00259         lappend newdata 1 $pt
00260     }
00261     return [mv-wls $newdata]
00262 }
00263