Tài liệu hạn chế xem trước, để xem đầy đủ mời bạn chọn Tải xuống
1
/ 74 trang
THÔNG TIN TÀI LIỆU
Thông tin cơ bản
Định dạng
Số trang
74
Dung lượng
395,4 KB
Nội dung
# (chapter break) Page 594
#
# Probability distribution reference functions.  Each distribution gets a
# density/mass function plus *_expected and *_variance helpers.  Several
# of these call factorial() and choose(), which are defined elsewhere in
# this book's code (earlier chapter) -- they are not redefined here.
#
# The Bernoulli Distribution

# bernoulli($x, $p) returns 1-$p if $x is 0, $p if $x is 1, 0 otherwise.
# Returns nothing (undef) for an out-of-range $p.
#
sub bernoulli {
    my ($x, $p) = @_;
    return unless $p > 0 && $p < 1;
    return $x ? ( ($x == 1) ? $p : 0 ) : (1 - $p);
}

sub bernoulli_expected { $_[0] }
sub bernoulli_variance { $_[0] * (1 - $_[0]) }

# The Beta Distribution

# beta( $x, $a, $b ) returns the Beta distribution for $x given the
# Beta parameters $a and $b.  (Integer $a, $b only: uses factorial().)
#
sub beta {
    my ($x, $a, $b) = @_;
    return unless $a > 0 and $b > 0;
    return factorial($a + $b - 1) / factorial($a - 1) / factorial($b - 1)
           * ($x ** ($a - 1)) * ((1 - $x) ** ($b - 1));
}

sub beta_expected { $_[0] / ($_[0] + $_[1]) }
sub beta_variance {
    ($_[0] * $_[1]) / (($_[0] + $_[1]) ** 2) / ($_[0] + $_[1] + 1)
}

# The Binomial Distribution

# binomial($x, $n, $p);
# binomial_expected($n, $p);
#
sub binomial {
    my ($x, $n, $p) = @_;
    return unless $x >= 0 && $x == int $x
               && $n >  0 && $n == int $n
               && $p >  0 && $p <  1;
    return factorial($n) / factorial($x) / factorial($n - $x)
           * ($p ** $x) * ((1 - $p) ** ($n - $x));
}

sub binomial_expected { $_[0] * $_[1] }
sub binomial_variance { $_[0] * $_[1] * (1 - $_[1]) }

# The Cauchy Distribution

use constant pi_inverse => 0.25 / atan2(1, 1);    # 1/pi

sub cauchy {
    my ($x, $a, $b) = @_;
    return unless $a > 0;
    return pi_inverse * $a / (($a ** 2) + (($x - $b) ** 2));
}

# The Cauchy distribution has no finite variance.
sub cauchy_expected { $_[1] }

# Page 595
# The Chi Square Distribution

# NOTE(review): factorial($n/2 - 1) assumes an even number of degrees of
# freedom $n -- confirm before using with odd $n.
sub chi_square {
    my ($x, $n) = @_;
    return 0 unless $x > 0;
    return 1 / factorial($n/2 - 1) * (2 ** (-$n / 2))
           * ($x ** (($n / 2) - 1)) * exp(-$x / 2);
}

sub chi_square_expected { $_[0] }
sub chi_square_variance { 2 * $_[0] }

# The Erlang Distribution

sub erlang {
    my ($x, $a, $n) = @_;
    return unless $a > 0 && $n > 0 && $n == int($n);
    return 0 unless $x > 0;
    return ($a ** $n) * ($x ** ($n-1)) * exp(-$a * $x) / factorial($n-1);
}

sub erlang_expected { $_[1] / $_[0] }
sub erlang_variance { $_[1] / ($_[0] ** 2) }

# The Exponential Distribution

sub exponential {
    my ($x, $a) = @_;
    return unless $a > 0;
    return 0 unless $x > 0;
    return $a * exp(-$a * $x);
}

sub exponential_expected { 1 / $_[0] }
sub exponential_variance { 1 / ($_[0] ** 2) }

# The Gamma Distribution

sub gamma {
    my ($x, $a, $b) = @_;
    return unless $a > -1 && $b > 0;
    return 0 unless $x > 0;
    return ($x ** $a) * exp(-$x / $b) / factorial($a) / ($b ** ($a + 1));
}

sub gamma_expected { ($_[0] + 1) * $_[1] }
sub gamma_variance { ($_[0] + 1) * ($_[1] ** 2) }

# The Gaussian (Normal) Distribution

use constant two_pi_sqrt_inverse => 1 / sqrt(8 * atan2(1, 1));   # 1/sqrt(2*pi)

sub gaussian {
    my ($x, $mean, $variance) = @_;
    # Page 596
    return two_pi_sqrt_inverse
           * exp( -( ($x - $mean) ** 2 ) / (2 * $variance) )
           / sqrt $variance;
}

# We don't provide subroutines to compute the expected value and variance
# because those are the parameters that define the Gaussian.  (The mean and
# expected value are synonymous in the Gaussian distribution.)

# The Geometric Distribution

sub geometric {
    my ($x, $p) = @_;
    return unless $p > 0 && $p < 1;
    return 0 unless $x == int($x);
    return $p * ((1 - $p) ** ($x - 1));
}

sub geometric_expected { 1 / $_[0] }
sub geometric_variance { (1 - $_[0]) / ($_[0] ** 2) }

# The Hypergeometric Distribution

sub hypergeometric {
    my ($x, $k, $m, $n) = @_;
    return unless $m > 0 && $m == int($m)
               && $n > 0 && $n == int($n)
               && $k > 0 && $k <= $m + $n;
    return 0 unless $x <= $k && $x == int($x);
    return choose($m, $x) * choose($n, $k - $x) / choose($m + $n, $k);
}

sub hypergeometric_expected { $_[0] * $_[1] / ($_[1] + $_[2]) }
sub hypergeometric_variance {
    my ($k, $m, $n) = @_;
    return $m * $n * $k * ($m + $n - $k) / (($m + $n) ** 2) / ($m + $n - 1);
}

# The Laplace Distribution

# laplace($x, $a, $b)
sub laplace {
    return unless $_[1] > 0;
    return $_[1] / 2 * exp( -$_[1] * abs($_[0] - $_[2]) );
}

sub laplace_expected { $_[1] }
sub laplace_variance { 2 / ($_[0] ** 2) }

# Page 597
# The Log Normal Distribution

use constant sqrt_twopi => sqrt(8 * atan2(1, 1));    # sqrt(2*pi)

sub lognormal {
    my ($x, $a, $b, $std) = @_;
    return unless $std > 0;
    return 0 unless $x > $a;
    return exp( -(((log($x - $a) - $b) ** 2) / (2 * ($std ** 2))) )
           / (sqrt_twopi * $std * ($x - $a));
}

sub lognormal_expected { $_[0] + exp($_[1] + 0.5 * ($_[2] ** 2)) }
sub lognormal_variance { exp(2 * $_[1] + ($_[2] ** 2)) * (exp($_[2] ** 2) - 1) }

# The Maxwell Distribution

use constant pi => 4 * atan2(1, 1);

sub maxwell {
    my ($x, $a) = @_;
    return unless $a > 0;
    return 0 unless $x > 0;
    return sqrt(2 / pi) * ($a ** 3) * ($x ** 2) * exp($a * $a * $x * $x / -2);
}

sub maxwell_expected { sqrt( 8/pi ) / $_[0] }
sub maxwell_variance { (3 - 8/pi) / ($_[0] ** 2) }

# The Pascal Distribution

sub pascal {
    my ($x, $n, $p) = @_;
    return unless $p > 0 && $p < 1 && $n > 0 && $n == int($n);
    return 0 unless $x >= $n && $x == int($x);
    return choose($x - 1, $n - 1) * ($p ** $n) * ((1 - $p) ** ($x - $n));
}

sub pascal_expected { $_[0] / $_[1] }
sub pascal_variance { $_[0] * (1 - $_[1]) / ($_[1] ** 2) }

# The Poisson Distribution

sub poisson {
    my ($x, $a) = @_;
    return unless $a >= 0 && $x >= 0 && $x == int($x);
    return ($a ** $x) * exp(-$a) / factorial($x);
}

sub poisson_expected { $_[0] }
sub poisson_variance { $_[0] }

# Page 598
# The Rayleigh Distribution
# (the constant pi is already defined above, in the Maxwell section --
#  defining it twice would trigger a "constant redefined" warning)

sub rayleigh {
    my ($x, $a) = @_;
    return unless $a > 0;
    return 0 unless $x > 0;
    return ($a ** 2) * $x * exp( -($a ** 2) * ($x ** 2) / 2 );
}

sub rayleigh_expected { sqrt(pi / 2) / $_[0] }
sub rayleigh_variance { (2 - pi / 2) / ($_[0] ** 2) }

# The Uniform Distribution
#
# The Uniform distribution is constant over the interval from $a to $b.

sub uniform {
    my ($x, $a, $b) = @_;
    return unless $b > $a;
    return 0 unless $x > $a && $x < $b;
    return 1 / ($b - $a);
}

sub uniform_expected { ($_[0] + $_[1]) / 2 }
sub uniform_variance { (($_[1] - $_[0]) ** 2) / 12 }

# Page 599
#
# 15 -- Statistics
#
#   There are three kinds of lies: lies, damned lies, and statistics.
#       -- Benjamin Disraeli (1804-1881)
#
# Statistics is the science of quantifying conjectures.  How likely is an
# event?  How much does it depend on other events?
Was an event due to chance, or is it attributable to another cause? And for whatever answers you might have for these questions, how confident are you that they're correct? Statistics is not the same as probability, but the two are deeply intertwined and on occasion blend together. The proper distinction between them is this: probability is a mathematical discipline, and probability problems have unique, correct solutions. Statistics is concerned with the application of probability theory to particular real-world phenomena. A more colloquial distinction is that probability deals with small amounts of data, and statistics deals with large amounts. As you saw in the last chapter, probability uses random numbers and random variables to represent individual events. Statistics is about situations: given poll results, or medical studies, or web hits, what can you infer? Probability began with the study of gambling; statistics has a more sober heritage. It arose primarily because of the need to estimate population, trade, and unemployment. In this chapter, we'll begin with some simple statistical measures: mean, median, mode, variance, and standard deviation. Then we'll explore significance tests, which tell you how sure you can be that some phenomenon (say, that programmers produce more lines of code when their boss is on vacation) is due to chance. Finally, we'll tackle correlations: how to establish to what extent something is dependent on something else (say, how height correlates to weight). This chaptercontinue Page 600 skims over much of the material you'll find in a semester-long university course in statistics, so the coverage is necessarily sparse throughout. Some of the tasks described in this chapter are encapsulated in the Statistics:: modules available on CPAN. 
# Colin Kuskie and Jason Lastner's Statistics::Descriptive module provides an
# object-oriented interface to many of the tasks outlined in the next section,
# and Jon Orwant's Statistics::ChiSquare performs a particular significance
# test described later in the chapter.
#
# Statistical Measures
#
# In the insatiable need to condense and summarize, people sometimes go too
# far.  Consider a plain-looking statement such as "The average yearly
# rainfall in Hawaii is 24 inches."  What does this mean, exactly?  Is that
# the average over 10 years?  A hundred?  Does it always rain about 24 inches
# per year, or are some years extremely rainy and others dry?  Does it rain
# equally over every month, or are some months wetter than others?  Maybe all
# 24 inches fall in March.  Maybe it never rains at all in Hawaii except for
# one Really Wet Day a long time ago.  The answer to our dilemma is obvious:
# lots of equations and jargon.  Let's start with the three distinct
# definitions of "average": the mean, median, and mode.
#
# The Mean
#
# When most people use the word "average," they mean the mean.  To compute
# it, you sum all of your data and divide by the number of elements.  Let's
# say our data is from an American football team that has scored the
# following number of points in sixteen games:
#
#   @points = (10, 10, 31, 28, 46, 22, 27, 28, 42, 31, 8, 27, 45, 34, 6, 23);
#
# The mean is easy to compute:

# $mean = mean(\@array) computes the mean of an array of numbers.
# Returns nothing (undef) for an empty array rather than dividing by zero.
#
sub mean {
    my ($arrayref) = @_;
    return unless @$arrayref;          # guard: empty data set has no mean
    my $result;
    foreach (@$arrayref) { $result += $_ }
    return $result / @$arrayref;
}

# When we call this subroutine as mean \@points or
# mean [10, 10, 31, 28, 46, 22, 27, 28, 42, 31, 8, 27, 45, 34, 6, 23],
# the answer 26.125 is returned.
The Statistics::Descriptive module lets you compute the mean of a data set after you create a new Statistics::Descriptive object:break Page 601 #!/usr/bin/perl use Statistics::Descriptive; $stat = Statistics::Descriptive::Full->new(); $stat->add_data(1 .. 100); $mean = $stat->mean(); print $mean; Computing a mean with Statistics::Descriptive is substantially slower (more than 10 times) than our hand-coded subroutine, mostly because of the overhead of creating the object. If you're going to be computing your mean only once, go with the subroutine. But if you want to create your data set, compute the mean, add some more data, compute the mean again, and so on, storing your data in a Statistics::Descriptive object will be worthwhile. One might decide that the weighted mean is more important than the mean. Games early in the season don't mean as much as later games, so perhaps we'd like to have the games count in proportion to their order in the array: @weights = (1 .. 16). We can't just multiply each score by these weights, however, because we'll end up with a huge score—226.8125 to be exact. What we want to do is normalize the weights so that they sum to one but retain the same ratios to one another. To normalize our data, the normalize() subroutine divides every weight by the sum of all the weights: 136 in this case.break @points = (10, 10, 31, 28, 46, 22, 27, 28, 42, 31, 8, 27, 45, 34, 6, 23); @weights = (1 .. 16); @normed_weights = normalize(\@weights); # Divide each weight by 136. print "Mean weighted score: ", weighted_average(\@points, \@normed_weights); # @norms = normalize(\@array) stores a normalized version of @array # in @norms.
sub normalize {
    my ($arrayref) = @_;
    my ($total, @result);
    foreach (@$arrayref) { $total += $_ }
    foreach (@$arrayref) { push(@result, $_ / $total) }
    return @result;
}

# weighted_average(\@values, \@weights) sums $values[$i] * $weights[$i];
# with normalized weights this is the weighted mean.
sub weighted_average {
    my ($arrayref, $weightref) = @_;
    my ($result, $i);
    for ($i = 0; $i < @$arrayref; $i++) {
        $result += $arrayref->[$i] * $weightref->[$i];
    }
    return $result;
}

# Page 602
#
# This yields a smidgen over 26.68—slightly more than the unweighted score
# of 26.125.  That tells us that our team improved a little over the course
# of the season, but not much.
#
# The Median
#
# A football team can't score 26.125 or 26.68 points, of course.  You might
# want to know the median score: the element in the middle of the data set.
# If the data set has five elements, the median is the third largest (and
# also the third smallest).  That might be far away from the mean: consider
# a data set such as @array = (9, 1, 10003, 10004, 10002); the mean is
# 6003.8, but the median is 10,002, the middle value of the sorted array.
# If your data set has an even number of elements, there are two equally
# valid definitions of the median.  The first is what we'll call the mean
# median—the middlemost value if there are an odd number of elements, or
# the average of the two middlemost values otherwise:

# $median = mean_median(\@array) computes the mean median of an array
# of numbers.
#
sub mean_median {
    my $arrayref = shift;
    my @array = sort { $a <=> $b } @$arrayref;
    if (@array % 2) {
        return $array[@array/2];
    } else {
        return ($array[@array/2-1] + $array[@array/2]) / 2;
    }
}

# You can also write the median function as the following one-liner, which
# is 12% faster because the temporary variable $arrayref is never created:

# $median = median(\@array) computes the odd median of an array of
# numbers.
# NOTE(review): this one-liner does not sort -- it assumes the caller
# passes an already-sorted array; verify at the call site.
#
sub median { $_[0]->[ @{$_[0]} / 2 ] }

# Sometimes, you want the median to be an actual member of the data set.
# In these cases, the odd median is used.  If there is an odd number of
# elements, the middlemost value is used, as you would expect.
# If there is an even number of elements, there are two middlemost values,
# and the one with an odd index is chosen.  Since statistics is closer to
# mathematics than computer science, their arrays start at 1 instead of 0.
# Computing the odd median of an array is fast when you do it like this:

# $om = odd_median(\@array) computes the odd median of an array of
# numbers.
#
sub odd_median {
    # Page 603
    my $arrayref = shift;
    # Sort numerically: the data in this chapter (scores, penny offsets,
    # including negatives) compares wrongly under the default string sort.
    my @array = sort { $a <=> $b } @$arrayref;
    return $array[(@array - (0,0,1,0)[@array & 3]) / 2];
}

# This is a curiously complex bit of code that manages to compute the odd
# median efficiently—even though the choice of element depends on how many
# elements @array contains, we don't need an if statement.  @array must
# fulfill one of three conditions: an odd number of elements (in which case
# @array & 3 will either be 1 or 3); an even number of elements divisible by
# 4 (in which case @array & 3 will be 0); or an even number of elements not
# divisible by 4 (in which case @array & 3 will be 2).  Only in the last
# case will $array[@array / 2] not be the odd median; in this case we want
# $array[(@array - 1) / 2] instead.  The bizarre construct
# (0,0,1,0)[@array & 3] yields whatever must be subtracted from @array
# before dividing in half; 0 most of the time, and 1 when the number of
# elements in @array is even but not divisible by 4.  Additional techniques
# for finding medians and the related quantities quartiles and percentiles
# can be found in Chapter 4, Sorting.
#
# The Mode
#
# The mode is the most common value.  For the data set
# @array = (1, 2, 3, 4, 5, 1000, 1000) the mode is 1000 because it appears
# twice.  (The mean is 287.86 and the median is 4.)  If there are two or
# more equally common elements, there are two options: declare that there
# is no mode (that is, return undef), or return the median of the modes.
# The following subroutine does the latter:

# $mode = mode(\@array) computes the mode of an array of numbers.
#
sub mode {
    my $arrayref = shift;
    my (%count, @result);
    # Use the %count hash to store how often each element occurs.
    foreach (@$arrayref) { $count{$_}++ }
    # Sort the elements according to how often they occur,
    # and loop through the sorted list, keeping the modes.
    foreach (sort { $count{$b} <=> $count{$a} } keys %count) {
        last if @result && $count{$_} != $count{$result[0]};
        push(@result, $_);
    }
    # Uncomment the following line to return undef for nonunique modes.
    # return undef if @result > 1;
    # Page 604
    # Return the odd median of the modes.
    return odd_median(\@result);    # odd_median() is defined earlier.
}

# Our football team had eight scores that occurred once and four scores
# that occurred twice: 10, 27, 28, and 31, so the mode is the third
# element, 28, as mode(\@points) tells us.
#
# Standard Deviation
#
# The standard deviation is a measure of how "spread out" a data set is.
# If you score 90 on a test, and the class mean was 75, that might be
# great—or it might merely be good.  If nearly everyone in the class scored
# within five points of 75, you did great.  But if one quarter of the class
# scored 90 or higher, your score is no longer so impressive.  The standard
# deviation tells you how far away numbers are from their mean.  Statistics
# textbooks fall into one of two categories: those that use fictional test
# scores to demonstrate the standard deviation and those that use heights
# or weights instead.  We decided to conduct our own experiment.  A handful
# of 50 pennies (our "sample," in statistics lingo) was dropped onto the
# center of a bed, and their distance (along the long axis of the bed) was
# measured.  The result is shown in Figure 15-1.  One penny fell 25
# centimeters to the left; another fell 26 centimeters to the right.  More
# than half the pennies fell within four centimeters of the center.  The
# mean of our data is 0.38, just to the right of center.  The mean median
# of our data is 2; so is the odd median.
We can say that there's no mode because five pennies fell three centimeters to the right and five pennies fell three centimeters to the left, or we can say that the mode is 0, because (-3 + 3)/2 is 0. It would have been nice if this data set looked more like the Gaussian curve shown in the previous chapter, with the highest number falling at 0. However, reality is not so forgiving, and a cardinal tenet of statistics is that you don't get to roll the dice twice. Now let's calculate the standard deviation. The standard deviation σ of a data set is: This is what we'll use to estimate how spread out our data is; in the next section, we'll see a slightly different formulation of the standard deviation that handles probability distributions.break Page 605 [...]... web hits with store sales, you look at the number of hits and sales total every month for five months:break Page 622 @hits = (2378, 4024, 96 96, 7314, 7710); @sales = (310 .94 , 315.88, 514.15, 500.18, 610.37); When we compute the covariance with covariance(@hits, \@sales) we get 2 691 24. 594 4 The maximum possible covariance is the product of the standard deviations of the data sets: 312 696 .05 094 3578 So... [\&f, \&g], [3, 4, -2] ); foreach $row (@jacobian) { for ($column = 0; $column < @$row; $column++) { print $row->[$column], " "; } print "\n"; } This prints: 11.0000000 296 0 59 -2 16 266666668.176214 0.25 11 .99 999 98667732 -8 -6 12.00000004440 89 As you would expect, not all the numbers are exact Remember, this is a numerical method, so the solution is only approximate The actual Jacobian is: As you can see,... 
* $suml) / @$arraylref; return ($b, $a); } When we apply best_line() to the points shown in Figure 15-3, we get 54 .99 5 296 3760 691 + 2.01701164212454x That is, our best estimate of the relation between sales and hits is that sales = 2.01701164212454 × the number of hits + 54 99 5 296 3760 691 The line is graphed in Figure 15-4.break Page 624 Figure 15-4 The best-fit line for the points in Figure 15-3 How... witnessed.break Page 611 Table 15-1 Probabilities Associated with Choices Number of left clicks, k Probability of exactly k left clicks Probability of at least k left clicks 8 1/256 1/256 = 0.00 39 7 8/256 9/ 256 = 0.0352 6 28/256 37/256 = 0.1445 5 56/256 93 /256 = 0.3633 4 70/256 163/256 = 0.6367 3 56/256 2 19/ 256 = 0.8555 2 28/256 247/256 = 0 .96 48 1 8/256 255/256 = 0 .99 61 0 1/256 256/256 = 1.0000 Given eight successive... Recipes in C (Press et al., 199 2) Computing the Jacobian The derivative tells you how a function changes with respect to one variable If you have a function in three variables and you want to express how that function changes with respect to each, you compute the gradient of the function, ∇ f, which you can think of as an array with three elements: the derivatives of the function with respect to each variable... 2) ); } Our pennies have a standard deviation slightly more than 5.124 For any data set with a Gaussian distribution, about 68% of the elements will be within one standard deviation of the mean, and approximately 95 % of the elements will be within two standard deviations So we expect 95 .50 ≈ 48 pennies to fall within two standard deviations; that is, between -10 centimeters and 10 centimeters of the... 
Charles: Edgar: Inigo: 77 and D (sd: B (sd: C (sd: F (sd: A (sd: F (sd: the standard deviation is 6.66 794 8 594 698 23 -0.7) 0 .9) -0.4) -1.3) 1.2) -1.2) Page 608 Florentine: Barbara: Dominique: Lissajous: Murgatroyd: Geraldo: A F C C C C (sd: 1.0) (sd: -1.2) (sd: 0.4) (sd: -0.3) (sd: 0.0) (sd: -0.3) Hacker: A (sd: 1 .9) The Variance and Standard Deviation of Distributions The variance, denoted σ 2, is the square... We can interpret our 95 % criterion in terms of standard deviations as well as pure probability In data with a Gaussian distribution, we expect 68% of our data to fall within one standard deviation of the mean, corresponding to a threshold of 32: not too good Two standard deviations should contain 98 % of the data, for a threshold of 02: that's too good The 05 level occurs at 1 .96 standard deviations... ($arrayref->[$i] - $mean) / $deviation; } return \@scores; } Here's a Perl program that uses several of the subroutines we've seen in this chapter to grade a set of test results: #!/usr/bin /perl %results = (Arnold => 72, Barbara => 69, Charles => 68, Dominique => 80, Edgar => 85, Florentine => 84, Geraldo => 75, Hacker => 90 , Inigo => 69, Jacqueline => 74, Klee => 83, Lissajous => 75, Murgatroyd => 77);... $sum_of_everything = sum($all); my $sum_of_groups = square_groups($all); my $degrees_of_freedom_within = $num_of_elements - @$all; my $degrees_of_freedom_between = @$all - 1; $sum_of_squares_within = $square_of_everything - $sum_of_groups; my $mean_of_squares_within = $sum_of_squares_within / $degrees_of_freedom_within; my $sum_of_squares_between = $sum_of_groups ($sum_of_everything ** 2)/$num_of_elements; . 0.0352 6 28/256 37/256 = 0.1445 5 56/256 93 /256 = 0.3633 4 70/256 163/256 = 0.6367 3 56/256 2 19/ 256 = 0.8555 2 28/256 247/256 = 0 .96 48 1 8/256 255/256 = 0 .99 61 0 1/256 256/256 = 1.0000 Given eight. "F"; } This displays:break The mean is 77 and the standard deviation is 6.66 794 8 594 698 23. 
Arnold: D (sd: -0.7) Klee: B (sd: 0 .9) Jacqueline: C (sd: -0.4) Charles: F (sd: -1.3) Edgar: A (sd: 1.2) Inigo:. 5.124. For any data set with a Gaussian distribution, about 68% of the elements will be within one standard deviation of the mean, and approximately 95 % of the elements will be within two standard