%% LyX 1.6.5 created this file. For more info, see http://www.lyx.org/.
%% Do not edit unless you really know what you are doing.
\documentclass[12pt,english]{article}
\usepackage[T1]{fontenc}
\usepackage[latin9]{inputenc}
\usepackage[letterpaper]{geometry}
\geometry{verbose,tmargin=1in,bmargin=1in,lmargin=1in,rmargin=1in}
\usepackage{float}
\makeatletter
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% LyX specific LaTeX commands.
\newcommand{\noun}[1]{\textsc{#1}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Textclass specific LaTeX commands.
\usepackage{Sweave}
\newcommand{\Rcode}[1]{{\texttt{#1}}}
\newcommand{\Robject}[1]{{\texttt{#1}}}
\newcommand{\Rcommand}[1]{{\texttt{#1}}}
\newcommand{\Rfunction}[1]{{\texttt{#1}}}
\newcommand{\Rfunarg}[1]{{\textit{#1}}}
\newcommand{\Rpackage}[1]{{\textit{#1}}}
\newcommand{\Rmethod}[1]{{\textit{#1}}}
\newcommand{\Rclass}[1]{{\textit{#1}}}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% User specified LaTeX commands.
% packages used
\usepackage{amssymb,latexsym}
\usepackage[mathscr]{eucal}
\usepackage{amsthm}
\usepackage{amsmath}
\usepackage{amsfonts}
\usepackage{layout}
\usepackage{multicol}
% Fancy Expectation Notation
\renewcommand{\P}{\mathrm{I\! P}}
\newcommand{\vP}{|\mathrm{I\! P}|}
\newcommand{\nvP}{\|\mathrm{I\! P}\|}
\newcommand{\E}{\mathrm{I\! E}}
\newcommand{\R}{\mathrm{I\! R}}
\newcommand{\vE}{|\mathrm{I\! E}|}
\newcommand{\nvE}{\|\mathrm{I\! E}\|}
\renewcommand{\d}{\mathrm{d}}
\newcommand{\ybar}{\overline{y}}
\newcommand{\xbar}{\overline{x}}
\newcommand{\Xbar}{\overline{X}}
\newcommand{\Ybar}{\overline{Y}}
\newcommand{\hs}{\hspace*{20pt}}
% Greek Letters (new way)
\newcommand{\ga}{\alpha}
\newcommand{\gb}{\beta}
\renewcommand{\gg}{\gamma} % old use was >>
\newcommand{\gG}{\Gamma}
\newcommand{\gd}{\delta}
\newcommand{\gep}{\epsilon} % use \geq for >=
\newcommand{\gk}{\kappa}
\newcommand{\gf}{\varphi}
\newcommand{\gl}{\lambda}
\newcommand{\gm}{\mu}
\newcommand{\gn}{\nu}
\newcommand{\go}{\omega}
\newcommand{\gp}{\pi}
\newcommand{\gs}{\sigma}
\newcommand{\gth}{\theta}
\newcommand{\gO}{\Omega}
\newcommand{\gP}{\Pi}
\newcommand{\cg}{\color[rgb]{0,0.5,0}}
\newcommand{\me}{\mathrm{e}}
\makeatother
\usepackage{babel}
\begin{document}
<>=
set.seed(42)
@
\begin{center}
STAT 3743 $\bullet$ PROBABILITY \& STATISTICS $\bullet$ FALL 2010
$\bullet$ KERNS
\par\end{center}
\begin{center}
\textsf{R} Lab \#1
\par\end{center}
\begin{flushright}
Name: \underbar{\makebox[2in]{ANSWER KEY}}
\par\end{flushright}
\vspace{0.1in}
\begin{quote}
\textbf{\noun{\footnotesize Note:}}{\footnotesize{} the questions are
randomly generated so these may (not) exactly match those on your
paper. The answers below are for }\emph{\footnotesize these}{\footnotesize{}
and if you have trouble seeing the connection between these and those,
ask me.}
\end{quote}
\begin{description}
\item [{Directions:}] for each of the following variables,
\begin{enumerate}
\item Construct several visual displays (appropriate for the data type).
Sketch by hand on a piece of paper.
\item Comment on what you have learned about the data (I'm thinking CUSS).
\item Comment on which displays capture those features, and which ones do
not.
\end{enumerate}
\end{description}
\vspace{0.1in}
<>=
tmp1 <- c("airquality", "airquality", "airquality", "airquality", "attenu", "attenu", "attenu", "attitude", "attitude", "attitude", "attitude", "attitude", "attitude", "attitude", "beaver1", "beaver2", "BOD", "cars", "cars", "cars", "chickwts", "faithful", "faithful", "Formaldehyde", "Formaldehyde", "infert", "infert", "infert", "InsectSprays", "iris", "iris", "iris", "iris", "LifeCycleSavings", "LifeCycleSavings", "LifeCycleSavings", "LifeCycleSavings", "LifeCycleSavings", "longley", "longley", "longley", "longley", "longley", "mtcars", "mtcars", "mtcars", "mtcars", "mtcars", "mtcars", "mtcars", "mtcars", "mtcars", "mtcars", "mtcars", "mtcars", "OrchardSprays", "PlantGrowth", "pressure", "pressure", "Puromycin", "Puromycin", "quakes", "quakes", "quakes", "rock", "rock", "rock", "rock", "sleep", "stackloss", "stackloss", "stackloss", "swiss", "swiss", "swiss", "swiss", "swiss", "swiss", "ToothGrowth", "trees", "trees", "trees", "USArrests", "USArrests", "USArrests", "USArrests", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "USJudgeRatings", "warpbreaks", "women", "women" )
tmp2 <- c("Ozone", "Solar.R", "Temp", "Wind", "accel", "dist", "mag", "advance", "complaints", "critical", "learning", "privileges", "raises", "rating", "temp", "temp", "demand", "dist", "speed", "speed", "weight", "eruptions", "waiting", "carb", "optden", "age", "education", "parity", "count", "Petal.Length", "Petal.Width", "Sepal.Length", "Sepal.Width", "ddpi", "dpi", "pop15", "pop75", "sr", "Armed.Forces", "GNP", "GNP.deflator", "Population", "Unemployed", "am", "carb", "cyl", "disp", "drat", "gear", "hp", "mpg", "qsec", "vs", "vs", "wt", "decrease", "weight", "pressure", "temperature", "conc", "rate", "depth", "mag", "stations", "area", "peri", "perm", "shape", "extra", "Acid.Conc", "Air.Flow", "Water.Temp", "Agriculture", "Catholic", "Education", "Examination", "Fertility", "Infant.Mortality", "len", "Girth", "Height", "Volume", "Assault", "Murder", "Rape", "UrbanPop", "CFMG", "CONT", "DECI", "DILG", "DMNR", "FAMI", "INTG", "ORAL", "PHYS", "PREP", "RTEN", "WRIT", "breaks", "height", "weight" )
runif(7)
ind <- sample(length(tmp1), size = 5)
ind <- c(24,29,20,21,27)
@
\begin{description}
\item [{Here~are~the~variables:}]~\end{description}
\begin{itemize}
\item \texttt{\Sexpr{tmp1[ind[1]]}\$\Sexpr{tmp2[ind[1]]}} (see \texttt{?\Sexpr{tmp1[ind[1]]}})
\item \texttt{\Sexpr{tmp1[ind[2]]}\$\Sexpr{tmp2[ind[2]]}}
\item \texttt{\Sexpr{tmp1[ind[3]]}\$\Sexpr{tmp2[ind[3]]}}
\item \texttt{\Sexpr{tmp1[ind[4]]}\$\Sexpr{tmp2[ind[4]]}}
\item \texttt{\Sexpr{tmp1[ind[5]]}\$\Sexpr{tmp2[ind[5]]}}
\end{itemize}
\vspace{0.5in}
\noindent \textbf{(Selected) Solutions:}
\paragraph*{For variable \texttt{\Sexpr{tmp1[ind[1]]}\$\Sexpr{tmp2[ind[1]]}}:}
<>=
D <- get(tmp1[ind[1]])
v <- D[, tmp2[ind[1]]]
@
We read in the help file that the data are from a chemical experiment
to prepare a standard curve for the determination of formaldehyde
by the addition of chromatropic acid and concentrated sulphuric acid
and the reading of the resulting purple color on a spectrophotometer.
The variable \texttt{\Sexpr{tmp2[ind[1]]}} is thus a quantitative,
continuous variable, Carbohydrate (ml). We can even take a look at
these data directly, since the sample size is so small.
<<>>=
Formaldehyde
@
The types of plots appropriate for quantitative data include histograms,
strip charts, stemplots, boxplots, \ldots{}, the list continues.
Which one is appropriate for these data? Let's look at a stemplot.
Rather than typing that whole big name every time, let's save the
data in a shorter variable name.
<>=
v <- Formaldehyde$carb
@
Now on to the stemplot.
<<>>=
library(aplpack)
stem.leaf(v)
@
From this we see that the sample median Carbohydrate level is approximately
0.5 (in fact, we can read it from the original data to be 0.55). The
range of the data (typically a bad measure, but not so bad for this
small data set) is 0.1 to 0.9, and it's hard to see any reliable shape.
A good way to get several descriptive statistics on a variable like
this is with the \texttt{summary} command.
<<>>=
summary(v)
@
There we get two measures of center and the quartiles.
Now let's take a look at another visual display. This is a small data
set, so let's try a stripchart.
<>=
stripchart(v)
@
%
\begin{figure}[H]
\begin{centering}
<>=
par(cex=0.5)
stripchart(v)
@
\par\end{centering}
\caption{Stripchart of \texttt{\Sexpr{tmp1[ind[1]]}\$\Sexpr{tmp2[ind[1]]}}}
\end{figure}
This gives us a good picture of the data; they spread from just above
zero to almost 0.10, and they are nearly uniformly distributed throughout
that range. The center looks to be around 0.55. The plot gives us
our first tangible idea about shape: these data are not peaked in
the middle with heavy tails, rather, they are flat all over with thin
tails. We would suspect that a distribution like this would be platykurtic.
We're here, so let's check.
<<>>=
library(e1071)
kurtosis(v)
@
Yes, it is just as we suspected. By the way, the data look ever so
slightly right-skewed. Let's check that as well.
<>=
skewness(v) # the e1071 package is already loaded
@
We are batting one-thousand, so far. There do not seem to be any unusual
features to this data set, which should not be a surprise given the
data set's size.
Now that we've seen a couple visual displays, let's try another (a
boxplot).
<>=
boxplot(v, horizontal = TRUE)
@
The boxplot is shown below. It shows a center of around 0.55 (located
at the median), the IQR is approximately $0.7-0.3=0.4$, and there
are no extreme values indicated. The boxplot hints at an ever-so-slight
right skewness. We should try to bear in mind, however, that its a
little bit silly to draw a boxplot for only six observations. We really
don't need a summary display; we would be just as well to look at
the individual data values with a stripchart.
%
\begin{figure}[H]
\begin{centering}
<>=
par(cex=0.5)
boxplot(v, horizontal = TRUE)
@
\par\end{centering}
\caption{Boxplot of \texttt{\Sexpr{tmp1[ind[1]]}\$\Sexpr{tmp2[ind[1]]}}}
\end{figure}
This data set is a good example of what to do when we have a small
data set. Boxplots (and histograms, as well) serve to summarize a
data set, and are not so helpful for small data. We can estimate things
like center and spread, but it is difficult to really go to the bank
with those estimates, because we really do not have a lot of information
from the population. The same remarks apply to judgements of shape:
they are tentative, at best.
For these data, we would say that the boxplot is not so useful, but
the stripchart and stemplot do a pretty good job. If I had to pick
a best one, I would say the strip chart.
\paragraph*{For variable \texttt{\Sexpr{tmp1[ind[2]]}\$\Sexpr{tmp2[ind[2]]}}:}
<>=
D <- get(tmp1[ind[2]])
v <- D[, tmp2[ind[2]]]
@
We read in the help file that the data are counts of insects in agricultural
experimental units treated with different insecticides. Let's save
ourselves some typing right now.
<>=
v <- InsectSprays$count
@
The sample size is
<<>>=
length(v)
v
@
Here we go; now we have a reasonably sized data set. These are counts,
so we are looking at quantitative discrete data. Let's try a stemplot.
Now on to the stemplot.
<<>>=
library(aplpack)
stem.leaf(v)
@
Nice. What a lot of information! The median is shown to be 9 insects,
the counts range from 0 to 26, and the shape is clearly right skewed.
There are no extreme values identified in the stemplot, but there
look to be two {}``humps'' on either side of the median, and there
may be another hump up higher, around 20. But this makes sense: we
already knew that the data were about different groups of insects
treated with different insecticides, so what we may be seeing is different
performance (or kill rates) for the different insecticides. See how
the depths drop off in an unbalanced way on either side of the median?
Such behavior is common with skewed data.
Let's try something else, maybe a boxplot.
%
\begin{figure}[H]
\begin{centering}
<>=
par(cex=0.5)
boxplot(v, horizontal = TRUE)
@
\par\end{centering}
\caption{Boxplot of \texttt{\Sexpr{tmp1[ind[2]]}\$\Sexpr{tmp2[ind[2]]}}}
\end{figure}
<>=
boxplot(v, horizontal = TRUE)
@
Well, the boxplot shows the center as before, we can read a measure
of spread (approximately the $IQR$) to be around 12, and the longer
whisker on the right side is suggestive of the right skewness. Note,
however, that we lost all information about the multiple humps. Boxplots
are blind to such things. As a redeeming feature we can also see that
there are no extreme values in this data set, just like the stemplot.
The boxplot has definitely summarized the data, perhaps too much in
this case.
We haven't tried a histogram, yet, let's look at one of those.
%
\begin{figure}[H]
\begin{centering}
<>=
par(cex=0.5)
hist(v)
@
\par\end{centering}
\caption{Boxplot of \texttt{\Sexpr{tmp1[ind[2]]}\$\Sexpr{tmp2[ind[2]]}}}
\end{figure}
<>=
hist(v)
@
The default histogram is OK. We see the right skewness, and there
is even a hint of maybe multiple humps. We can't tell the actual highest
value, though; for all we know, the maximum value could even have
been 30! All in all, the default histogram is so-so in its performance
with these data. Another thing we could try is to fiddle with the
number of bins. Different bins means different histograms. Here is
a histogram with (approximately) 15 bins.
%
\begin{figure}[H]
\begin{centering}
<>=
par(cex=0.5)
hist(v, breaks = 15)
@
\par\end{centering}
\caption{Boxplot of \texttt{\Sexpr{tmp1[ind[2]]}\$\Sexpr{tmp2[ind[2]]}}}
\end{figure}
<>=
hist(v, breaks = 15)
@
This is definitely better. Now we can tell that the max is 26, and
the two humps are more prominent. For these data, however, even this
histogram does not hold a candle to the stemplot. Stemplot is a winner.
\paragraph*{For variable \texttt{\Sexpr{tmp1[ind[5]]}\$\Sexpr{tmp2[ind[5]]}}:}
<>=
D <- get(tmp1[ind[5]])
v <- D[, tmp2[ind[5]]]
@
The help file says that the data come from a matched case-control
study dating from before the availability of conditional logistic
regression. The education variable is shown to be a categorical (qualitative)
variable, it is a factor, and the levels are intuitively ordered (but
note they are not represented internally as an ordered factor). We
can see the first part of the data with a command like this:
<>=
v <- infert$education
@
<<>>=
v[1:5]
@
The sample size is \Sexpr{length(v)}, which is the largest we have
yet seen. We may quickly make a frequency table like this:
<<>>=
table(v)
@
There are three categories; the majority of observations are in the
second category while there are relatively few observations in the
first category. Let's take a look at some visual displays of the data.
<>=
barplot(table(v))
@
%
\begin{figure}[H]
\begin{centering}
<>=
par(cex=0.5)
barplot(table(v))
@
\par\end{centering}
\caption{Bar graph of \texttt{\Sexpr{tmp1[ind[5]]}\$\Sexpr{tmp2[ind[5]]}}}
\end{figure}
<>=
dotchart(table(v))
@
%
\begin{figure}[H]
\begin{centering}
<>=
par(cex=0.5)
dotchart(table(v))
@
\par\end{centering}
\caption{Cleveland dot chart of \texttt{\Sexpr{tmp1[ind[5]]}\$\Sexpr{tmp2[ind[5]]}}}
\end{figure}
<>=
library(qcc)
pareto.chart(table(v))
@
%
\begin{figure}[H]
\begin{centering}
<>=
par(cex=0.5)
library(qcc)
pareto.chart(table(v))
@
\par\end{centering}
\caption{Pareto diagram of \texttt{\Sexpr{tmp1[ind[5]]}\$\Sexpr{tmp2[ind[5]]}}}
\end{figure}
In all honesty, all three graphs convey essentially the same information
for these data. There are only three categories, and the relationship
between them is reasonably clear. There are no clear winners. It is
a good thing when multiple visual displays suggest the same information,
or better put, it is BAD when radically different messages are conveyed
depending on the display used. We should count our lucky stars in
this case and rest for a moment, rest with the knowledge that the
next data set we encounter will likely not be so simple.
\end{document}