# unzip() extracts the archive and returns the extracted path, which read.csv() parses
activity <- read.csv(file = unzip(zipfile = "activity.zip"))
Data Structure:
str(activity) # Compact overview: dimensions, each column's type and its first few values
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
Data Summary:
summary(activity) # Per-column summary: quartiles/mean for numeric columns, counts per level, and NA totals
## steps date interval
## Min. : 0 2012-10-01: 288 Min. : 0
## 1st Qu.: 0 2012-10-02: 288 1st Qu.: 589
## Median : 0 2012-10-03: 288 Median :1178
## Mean : 37 2012-10-04: 288 Mean :1178
## 3rd Qu.: 12 2012-10-05: 288 3rd Qu.:1766
## Max. :806 2012-10-06: 288 Max. :2355
## NA's :2304 (Other) :15840
# Total steps per day. The formula method of aggregate() uses
# na.action = na.omit by default, so rows with missing steps are left out
# of the sums rather than turning them into NA.
totalstepsperday <- aggregate(steps ~ date, data = activity, FUN = sum)

# Histogram of the daily totals
hist(
  totalstepsperday$steps,
  main = "Histogram Of Total Number Of Steps Per Day",
  xlab = "Total Number Of Steps Per Day",
  col = "lightblue",
  border = "blue4",
  lty = 2
)
mtext(text = "(With Missing Values)")

# Mean and median of the daily totals
meantotsteps <- mean(totalstepsperday$steps)
mediantotsteps <- median(totalstepsperday$steps)
Mean Total Number Of Steps Per Day (with missing values) : 10766.19
Median Total Number Of Steps Per Day (with missing values) : 10765
Generate New Dataframe to track average steps per interval :
# Mean number of steps for each 5-minute interval, averaged across days
avgstepsperinterval <- aggregate(steps ~ interval, data = activity, FUN = mean)
# Inspect the structure, then the distribution, of the per-interval averages
str(avgstepsperinterval)
summary(avgstepsperinterval)
## 'data.frame': 288 obs. of 2 variables:
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
## $ steps : num 1.717 0.3396 0.1321 0.1509 0.0755 ...
## interval steps
## Min. : 0 Min. : 0
## 1st Qu.: 589 1st Qu.: 2
## Median :1178 Median : 34
## Mean :1178 Mean : 37
## 3rd Qu.:1766 3rd Qu.: 53
## Max. :2355 Max. :206
Time Series Plot - Average Steps per 5 min interval Over all Days
# Enlarge the left margin so the two-line y-axis label fits
par(mar = c(5, 6, 4, 2))
# Line plot of the average steps against the interval identifier
with(
  avgstepsperinterval,
  plot(
    interval,
    steps,
    type = "l",
    col = "blue",
    main = "Avg Steps Per 5 min Interval Over All Days",
    xlab = "Time Intervals",
    ylab = "Avg Steps Per 5 min Interval \nOver All Days"
  )
)
Interval with Maximum Average Steps :
# Interval identifier(s) whose average step count equals the overall maximum
intmaxsteps <- with(avgstepsperinterval, interval[steps == max(steps)])
5 min Interval with Maximum Average Steps : 835
Number of Missing Values :
# Count the rows whose step count is missing (TRUE counts as 1 when summed)
numissingvalues <- sum(is.na(activity$steps))
Total Number of Missing Values : 2304
Filling Missing Values (use mean of time interval) :
# Join each observation to its interval's average; the clashing "steps"
# columns come back as steps.x (observed) and steps.y (interval mean)
activitynew <- merge(x = activity, y = avgstepsperinterval, by = "interval")
# Review the merged data frame, including the NA count in steps.x
summary(activitynew)
## interval steps.x date steps.y
## Min. : 0 Min. : 0 2012-10-01: 288 Min. : 0
## 1st Qu.: 589 1st Qu.: 0 2012-10-02: 288 1st Qu.: 2
## Median :1178 Median : 0 2012-10-03: 288 Median : 34
## Mean :1178 Mean : 37 2012-10-04: 288 Mean : 37
## 3rd Qu.:1766 3rd Qu.: 12 2012-10-05: 288 3rd Qu.: 53
## Max. :2355 Max. :806 2012-10-06: 288 Max. :206
## NA's :2304 (Other) :15840
# Fill each missing step count with the rounded mean of its interval.
# Both is.na() masks are evaluated on the still-unfilled column, so the
# replacement values line up with the positions being replaced.
activitynew$steps.x <- replace(
  activitynew$steps.x,
  is.na(activitynew$steps.x),
  round(activitynew$steps.y[is.na(activitynew$steps.x)], 0)
)
# Keep only the original three columns (drops the merged interval mean)
activitynew <- activitynew[, c("interval", "steps.x", "date")]
# Restore the original column name for the filled step counts
names(activitynew)[names(activitynew) == "steps.x"] <- "steps"
# Confirm that no NA's remain
summary(activitynew)
## interval steps date
## Min. : 0 Min. : 0 2012-10-01: 288
## 1st Qu.: 589 1st Qu.: 0 2012-10-02: 288
## Median :1178 Median : 0 2012-10-03: 288
## Mean :1178 Mean : 37 2012-10-04: 288
## 3rd Qu.:1766 3rd Qu.: 27 2012-10-05: 288
## Max. :2355 Max. :806 2012-10-06: 288
## (Other) :15840
** Note absence of NA’s in the second summary above
Histogram, mean, and median of the total steps taken per day for the new data frame (identical to activity, but with the missing values filled in):
# Total steps per day, computed from the data frame with missing values filled
newtotalstepsperday <- aggregate(steps ~ date, data = activitynew, FUN = sum)

# Histogram of the daily totals after filling
hist(
  newtotalstepsperday$steps,
  main = "Histogram Of Total Number Of Steps Per Day",
  xlab = "Total Number Of Steps Per Day",
  col = "lightgreen",
  border = "green4",
  lty = 2
)
mtext(text = "(Missing Values Filled)")

# Mean and median of the daily totals after filling
newmeantotsteps <- mean(newtotalstepsperday$steps)
newmediantotsteps <- median(newtotalstepsperday$steps)
New Mean Total Number Of Steps Per Day (Missing values filled) : 10765.64
New Median Total Number Of Steps Per Day (Missing values filled) : 10762
Impact Of Adding Missing Values :
# Estimates before (old) and after (new) filling the missing values;
# the vector names become the row names of the comparison table
old <- c(mean = meantotsteps, median = mediantotsteps)
new <- c(mean = newmeantotsteps, median = newmediantotsteps)
oldnew <- data.frame(old, new)
# Relative change from old to new, expressed in percent
oldnew[["diff"]] <- (new - old) / old * 100
oldnew
## old new diff
## mean 10766 10766 -0.0051
## median 10765 10762 -0.0279
The “diff” column in the data frame above shows the percentage difference between the earlier estimates of the mean and median (computed with the missing values present) and the current estimates (computed with the missing values filled in). The difference is very marginal and practically negligible. However, the method used to fill in the missing values may itself affect the estimates, and evaluating that is a separate area of analysis.
Add a new column to track weekday/weekend :
# Add a two-level factor column `day` ("weekday" / "weekend") to activitynew.
# The weekend test uses the numeric day of week from POSIXlt (0 = Sunday,
# 6 = Saturday) instead of weekdays(), whose day names depend on the session
# locale: under a non-English locale the names "Saturday"/"Sunday" never
# match and every row would silently be labelled "weekday".
activitynew$day <- factor(
  ifelse(
    as.POSIXlt(as.Date(activitynew$date))$wday %in% c(0L, 6L),
    "weekend",
    "weekday"
  ),
  levels = c("weekday", "weekend") # fixed level order, even if one level is absent
)
str(activitynew) # Review structure with the added column
## 'data.frame': 17568 obs. of 4 variables:
## $ interval: int 0 0 0 0 0 0 0 0 0 0 ...
## $ steps : num 2 0 0 0 0 0 0 0 0 0 ...
## $ date : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 54 28 37 55 46 20 47 38 56 ...
## $ day : Factor w/ 2 levels "weekday","weekend": 1 1 2 1 2 1 2 1 1 2 ...
library(lattice)

# Mean steps per interval, computed separately for weekdays and weekends
newavgstepsinterval <- aggregate(
  steps ~ interval + day,
  data = activitynew,
  FUN = mean
)

# One line panel per day type, stacked vertically (1 column, 2 rows)
xyplot(
  steps ~ interval | day,
  data = newavgstepsinterval,
  type = "l",
  col = "red",
  layout = c(1, 2),
  main = "Avg Steps Per 5 min Interval Over All Days",
  xlab = "Time Intervals",
  ylab = "Avg Steps Per 5 min Interval \nOver All Days"
)
A first visual evaluation of the plot indicates that, except between the intervals labelled 500 and 1000 (the interval identifiers appear to encode the time of day as HHMM, i.e. roughly 05:00 to 10:00), the activity level for weekdays and weekends seems to be relatively similar. Additional analysis would be required to understand the difference between weekdays and weekends in that range.