%% LyX 1.6.7 created this file. For more info, see http://www.lyx.org/.
%% Do not edit unless you really know what you are doing.
\documentclass[11pt,english]{article}
\usepackage[T1]{fontenc}
\usepackage[latin9]{inputenc}
\usepackage{listings}
\usepackage[letterpaper]{geometry}
\geometry{verbose,tmargin=1in,bmargin=1in,lmargin=1in,rmargin=1in}
\usepackage{float}
\makeatletter
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% LyX specific LaTeX commands.
% \noun{text}: typeset the argument in small caps.
\newcommand{\noun}[1]{\textsc{#1}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Textclass specific LaTeX commands.
\usepackage{Sweave}
% Inline markup for R entities, each taking the text to typeset as its
% single argument. Code-like items (code, objects, commands, functions) are
% set in typewriter type; function arguments, packages, methods and classes
% are set in italics.
\newcommand{\Rcode}[1]{{\texttt{#1}}}
\newcommand{\Robject}[1]{{\texttt{#1}}}
\newcommand{\Rcommand}[1]{{\texttt{#1}}}
\newcommand{\Rfunction}[1]{{\texttt{#1}}}
\newcommand{\Rfunarg}[1]{{\textit{#1}}}
\newcommand{\Rpackage}[1]{{\textit{#1}}}
\newcommand{\Rmethod}[1]{{\textit{#1}}}
\newcommand{\Rclass}[1]{{\textit{#1}}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% User specified LaTeX commands.
% packages used
\usepackage{amssymb,latexsym}
\usepackage[mathscr]{eucal}
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{layout}
\usepackage{multicol}
% Fancy Expectation Notation
% Each symbol is an upright "I" pulled into the following capital with a
% negative thin space (\!), imitating a blackboard-bold letter.
% The \v... variants wrap the symbol in single bars, the \nv... variants in
% double bars.
% \P must be \renewcommand'ed: LaTeX already defines it (paragraph sign).
\renewcommand{\P}{\mathrm{I\! P}}
\newcommand{\vP}{|\mathrm{I\! P}|}
\newcommand{\nvP}{\|\mathrm{I\! P}\|}
\newcommand{\E}{\mathrm{I\! E}}
\newcommand{\R}{\mathrm{I\! R}}
\newcommand{\vE}{|\mathrm{I\! E}|}
\newcommand{\nvE}{\|\mathrm{I\! E}\|}
% Upright differential "d". This replaces LaTeX's \d (dot-under accent),
% which is therefore no longer available in this document.
\renewcommand{\d}{\mathrm{d}}
% Sample-mean shorthands (overlined letters).
\newcommand{\ybar}{\overline{y}}
\newcommand{\xbar}{\overline{x}}
\newcommand{\Xbar}{\overline{X}}
\newcommand{\Ybar}{\overline{Y}}
% Fixed 20pt horizontal gap; the starred \hspace* survives at line breaks.
\newcommand{\hs}{\hspace*{20pt}}
% Greek Letters (new way)
% Two- or three-letter shortcuts of the form \g<letter>.
\newcommand{\ga}{\alpha}
\newcommand{\gb}{\beta}
\renewcommand{\gg}{\gamma} % old use was >> (the "much greater than" relation)
\newcommand{\gG}{\Gamma}
\newcommand{\gd}{\delta}
\newcommand{\gep}{\epsilon} % named \gep, not \ge, so that \ge / \geq keep meaning >=
\newcommand{\gk}{\kappa}
\newcommand{\gf}{\varphi}
\newcommand{\gl}{\lambda}
\newcommand{\gm}{\mu}
\newcommand{\gn}{\nu}
\newcommand{\go}{\omega}
\newcommand{\gp}{\pi}
\newcommand{\gs}{\sigma}
\newcommand{\gth}{\theta}
\newcommand{\gO}{\Omega}
\newcommand{\gP}{\Pi}
% \cg switches the text colour to dark green for the rest of the current
% group. \color is provided by the color package, which no other package in
% this preamble loads, so it is loaded here to keep \cg usable.
\usepackage{color}
\newcommand{\cg}{\color[rgb]{0,0.5,0}}
% Upright "e" for the exponential constant.
\newcommand{\me}{\mathrm{e}}
\makeatother
\usepackage{babel}
\begin{document}
<<echo=FALSE>>=
set.seed(42)
@
\begin{center}
STAT 3743 $\bullet\ $PROBABILITY \& STATISTICS $\bullet\ $FALL 2010
$\bullet\ $KERNS
\par\end{center}
\begin{center}
Exam I
\par\end{center}
\begin{flushright}
Name: \underbar{\makebox[2in]{ANSWER KEY}}
\par\end{flushright}
{\footnotesize \begin{quote}
\textbf{\noun{\footnotesize Note:}}{\footnotesize{} the questions are
randomly generated so these may (not) exactly match those on your
paper. The answers below are for }\emph{\footnotesize these}{\footnotesize{}
and if you have trouble seeing the connection between these and those,
ask me.}
\end{quote}
}{\footnotesize \par}
\vspace{0.1in}
\textbf{Directions:} SHOW ALL WORK. You may use \textsf{R} for computations,
but no other software (and in particular, not the Internet). If you
use \textsf{R} to calculate something, then hand write the \textsf{R}
code that you typed, together with the numerical answer.
<<echo=FALSE>>=
# Pool of (data set, variable) pairs from which the exam questions are drawn.
# tmp1[i] names a data set and tmp2[i] names a column of that data set; the
# two vectors are parallel and must stay the same length. The pool size fixes
# which rows the seeded sample() call below picks, so do not add or remove
# entries without expecting a different exam.
tmp1 <- c("airquality", "airquality", "airquality", "airquality", "attenu", "attenu", "attenu", "attitude", "attitude", "attitude", "attitude", "attitude", "attitude", "attitude", "beaver1", "beaver2", "BOD", "cars", "cars", "cars", "chickwts", "faithful", "faithful", "Formaldehyde", "Formaldehyde", "infert", "infert", "infert", "InsectSprays", "iris", "iris", "iris", "iris", "LifeCycleSavings", "LifeCycleSavings", "LifeCycleSavings", "LifeCycleSavings", "LifeCycleSavings", "longley", "longley", "longley", "longley", "longley", "mtcars", "mtcars", "mtcars", "mtcars", "mtcars", "mtcars", "mtcars", "mtcars", "mtcars", "mtcars", "mtcars", "mtcars", "OrchardSprays", "PlantGrowth", "pressure", "pressure", "Puromycin", "Puromycin", "quakes", "quakes", "quakes", "rock", "rock", "rock", "rock", "sleep", "stackloss", "stackloss", "stackloss", "swiss", "swiss", "swiss", "swiss", "swiss", "swiss", "ToothGrowth", "trees", "trees", "trees", "USArrests", "USArrests", "USArrests", "USArrests", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "warpbreaks", "women", "women" )
# "Acid.Conc." carries its trailing dot: later chunks extract the column with
# D[[name]], which matches names exactly and returns NULL for "Acid.Conc".
# NOTE(review): "speed" (cars) and "vs" (mtcars) each appear twice, so they
# are twice as likely to be drawn -- confirm this is intended.
# NOTE(review): infert$education is a factor, not numeric; IQR()/boxplot()
# would fail if that pair were drawn -- consider swapping in a numeric column.
tmp2 <- c("Ozone", "Solar.R", "Temp", "Wind", "accel", "dist", "mag", "advance", "complaints", "critical", "learning", "privileges", "raises", "rating", "temp", "temp", "demand", "dist", "speed", "speed", "weight", "eruptions", "waiting", "carb", "optden", "age", "education", "parity", "count", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "ddpi", "dpi", "pop15", "pop75", "sr", "Armed.Forces", "GNP", "GNP.deflator", "Population", "Unemployed", "am", "carb", "cyl", "disp", "drat", "gear", "hp", "mpg", "qsec", "vs", "vs", "wt", "decrease", "weight", "pressure", "temperature", "conc", "rate", "depth", "mag", "stations", "area", "peri", "perm", "shape", "extra", "Acid.Conc.", "Air.Flow", "Water.Temp", "Agriculture", "Catholic", "Education", "Examination", "Fertility", "Infant.Mortality", "len", "Girth", "Height", "Volume", "Assault", "Murder", "Rape", "UrbanPop", "CFMG", "CONT", "DECI", "DILG", "DMNR", "FAMI", "INTG", "ORAL", "PHYS", "PREP", "RTEN", "WRIT", "breaks", "height", "weight" )
# Guard against the two vectors drifting out of step when the pool is edited.
stopifnot(length(tmp1) == length(tmp2))
# Draw five distinct rows of the pool; ind[1] and ind[2] drive the questions.
ind <- sample(length(tmp1), size = 5)
@
<<echo=FALSE, results=hide>>=
data.frame(tmp1, tmp2)[ind,]
@
\begin{enumerate}
\item For this problem we will study the \texttt{\Sexpr{tmp1[ind[1]]}}
data. You can read about them with \texttt{?\Sexpr{tmp1[ind[1]]}}.
In particular, let us focus on the variable \texttt{\Sexpr{tmp2[ind[1]]}}.
\begin{enumerate}
\item First, store the values of \texttt{\Sexpr{tmp2[ind[1]]}} in a vector
\lstinline[basicstyle={\ttfamily},showstringspaces=false]!x!. The
quickest way to do this is
\begin{center}
\texttt{x <- \Sexpr{tmp1[ind[1]]}\$\Sexpr{tmp2[ind[1]]}}
\par\end{center}
<<echo=FALSE>>=
D <- get(tmp1[ind[1]])
x <- D[[tmp2[ind[1]]]]
@
(There isn't anything to write down for this part).
\item Find the IQR of \texttt{\Sexpr{tmp2[ind[1]]}}.
\item Find the Five Number Summary (5NS).
\item Use the 5NS to calculate what the width of the box in a boxplot of
\texttt{\Sexpr{tmp2[ind[1]]}} would be.
\item Compare your answers to (b) and (d). Are they the same? If not, are they
close?
\item Make a boxplot of \texttt{\Sexpr{tmp2[ind[1]]}}, and include a
sketch of it in your report.
\item Are there any potential/suspected outliers? If so, list their values.
\emph{Hint:} take a look at \texttt{sort(x)}.
\item Using the rules discussed in class, classify each of the answers to
(g), if any, as \emph{potential} or \emph{suspected} outliers.
\end{enumerate}
\textbf{Solution:}
I will forgo showing the first part. We will do the next two parts
at once.
<<>>=
IQR(x)
fivenum(x)
@
The width of the box of a boxplot is the upper hinge minus the lower
hinge.
<<>>=
fivenum(x)[4] - fivenum(x)[2]
@
For these data the IQR and the width of the box are identical.
For many data sets, however, they are off by a little bit.
Below is a boxplot of the data.
%
\begin{figure}[H]
\begin{centering}
<<fig=TRUE, echo=FALSE>>=
par(cex=0.5)
boxplot(x, horizontal = TRUE)
@
\par\end{centering}
\caption{Boxplot of \texttt{\Sexpr{tmp1[ind[1]]}\$\Sexpr{tmp2[ind[1]]}}}
\end{figure}
We can see from the boxplot that for these data there is one (1) extreme
value, off to the left. The quickest way to get its value is with
the \texttt{boxplot.stats} function.
<<>>=
boxplot.stats(x)$out
@
We could just as easily have sorted the data with \texttt{sort(x)}
and then looked for the smallest data value (the first entry).
We next use the rules from class to see whether the datum is a potential
or a suspected outlier. We do that by calculating the lower and
upper fences. Here is a quick way to do that (or you can do it by
hand).
<<>>=
w <- fivenum(x)[4] - fivenum(x)[2]
fivenum(x)[2]-c(3, 1.5)*w
fivenum(x)[4]+c(1.5, 3)*w
@
Since the datum does not fall outside the outer fence on the left,
it is merely a potential outlier.
\item This problem studies the \texttt{\Sexpr{tmp1[ind[2]]}} data. You
can read about them with \texttt{?\Sexpr{tmp1[ind[2]]}}. Type \texttt{head(\Sexpr{tmp1[ind[2]]})}
at the command prompt for a quick look at the top of the data set.
\begin{enumerate}
\item Identify the data type of each of the variables.
\item Now type \texttt{attach(\Sexpr{tmp1[ind[2]]})} at the command prompt
which will allow you to simply type variable names without the dollar
signs. Try it. Type \texttt{\Sexpr{tmp2[ind[2]]}} at the command
prompt.
\begin{enumerate}
\item Choose an appropriate visual display for \texttt{\Sexpr{tmp2[ind[2]]}}
and sketch the graph (just a sketch). You will want to try several
choices before you decide on an {}``appropriate'' one.
\item Report at least two (2) measures of center for \texttt{\Sexpr{tmp2[ind[2]]}}.
Based on what you know about the data from above, make a decision
about which measure is the better one for these data, and tell me
why.
\item Report at least two (2) measures of spread for \texttt{\Sexpr{tmp2[ind[2]]}}.
Again, based on what you know about the data from above, make a decision
about which measure is the better one for these data, and tell me
why.
\item Report at least two (2) measures of shape for \texttt{\Sexpr{tmp2[ind[2]]}}.
Use the rules-of-thumb we discussed in class to decide if the values
you observed are substantially different from zero. \emph{Hint:} don't
forget \texttt{library(e1071)}.
\end{enumerate}
\item Report any other unusual features of \texttt{\Sexpr{tmp2[ind[2]]}}
that you see.
\end{enumerate}
\noindent \textbf{Solution:}
\begin{enumerate}
\item These data are all quantitative, presumably continuous (or at least
likely should be taken as continuous).
<<echo=FALSE>>=
D <- get(tmp1[ind[2]])
x <- D[[tmp2[ind[2]]]]
@
\item Descriptive statistics
\begin{enumerate}
\item Visual display. We will put down several to give us a feel for the
data.
%
\begin{figure}[H]
\begin{centering}
<<fig=TRUE, echo=FALSE>>=
par(cex=0.5)
boxplot(x, horizontal = TRUE)
@
\par\end{centering}
\caption{Boxplot of \texttt{\Sexpr{tmp1[ind[2]]}\$\Sexpr{tmp2[ind[2]]}}}
\end{figure}
%
\begin{figure}[H]
\begin{centering}
<<fig=TRUE, echo=FALSE>>=
par(cex=0.5)
hist(x)
@
\par\end{centering}
\caption{Histogram of \texttt{\Sexpr{tmp1[ind[2]]}\$\Sexpr{tmp2[ind[2]]}}}
\end{figure}
%
\begin{figure}[H]
\begin{centering}
<<fig=TRUE, echo=FALSE>>=
par(cex=0.5)
stripchart(x, method = "stack")
@
\par\end{centering}
\caption{Stripchart of \texttt{\Sexpr{tmp1[ind[2]]}\$\Sexpr{tmp2[ind[2]]}}}
\end{figure}
<<>>=
library(aplpack)
stem.leaf(x)
@
For these data it looks like the strip chart conveys the most information
about the data (the stemplot does a good job, too). The boxplot really
loses most of the information about the shape and granularity of the
data, while the histogram falls in the middle, better than the boxplot
but not quite as informative as the strip chart.
\item Measures of center.
<<>>=
mean(x)
median(x)
mean(x, trim = 0.05)
@
Since these data had potential outlier(s) we should choose a resistant
measure of center such as the median or trimmed mean.
\item Measures of spread.
<<>>=
sd(x)
IQR(x)
mad(x)
@
Since these data had potential outlier(s) we should choose a resistant
measure of spread such as the IQR or MAD.
\item Measures of shape.
<<>>=
library(e1071)
skewness(x)
kurtosis(x)
@
To see if these are relatively large we would calculate
<<>>=
n <- length(x)
sqrt(6/n)*c(2,4)
@
So these data are substantially skewed left but not substantially
kurtic.
\end{enumerate}
\item Anything else noteworthy.
We can see from the boxplot that there are two extreme values to the
left, and visually we can surmise that these are potential outliers
(or we can check by hand). \end{enumerate}
\end{enumerate}
\end{document}