Skip to content

Nicolecasing/Getting-and-Cleaning-Data

Folders and files

Name
Last commit message
Last commit date

Latest commit

 

History

9 Commits
 
 
 
 

Repository files navigation

Getting-and-Cleaning-Data

library(dplyr)

# Download the zipped data set unless a local copy already exists.
URL <- "https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip"
getdataURL <- "UCI HAR Dataset.zip"

if (!file.exists(getdataURL)) {
  # The archive is binary, so write it in binary mode ("wb").
  download.file(URL, getdataURL, mode = "wb")
}

# Unzip the archive unless the extracted folder is already present.
dataPath <- "UCI HAR Dataset"

if (!file.exists(dataPath)) {
  unzip(getdataURL)
}

## Load the raw tables from the extracted data folder

# feature names; as.is = TRUE keeps the text labels as plain strings
features <- read.table(file = file.path(dataPath, "features.txt"), as.is = TRUE)

# activity lookup table, with its two columns given readable names
activities <- read.table(file = file.path(dataPath, "activity_labels.txt"))
names(activities) <- c("activityId", "activityLabel")

# test split: subject, values, activity
testSubject <- read.table(file = file.path(dataPath, "test", "subject_test.txt"))
testValues <- read.table(file = file.path(dataPath, "test", "X_test.txt"))
testActivity <- read.table(file = file.path(dataPath, "test", "y_test.txt"))

# training split: subject, values, activity
trainingSubject <- read.table(
  file = file.path(dataPath, "train", "subject_train.txt")
)
trainingValues <- read.table(
  file = file.path(dataPath, "train", "X_train.txt")
)
trainingActivity <- read.table(
  file = file.path(dataPath, "train", "y_train.txt")
)

## STEP 1 - merge the train and test data sets into a single data table

# Each split is bound column-wise (subject, values, activity) and the two
# splits are then stacked row-wise, training rows first.
Activity <- rbind(
  cbind(trainingSubject, trainingValues, trainingActivity),
  cbind(testSubject, testValues, testActivity)
)

# remove the individual data tables to save some memory
rm(
  trainingSubject, trainingValues, trainingActivity,
  testSubject, testValues, testActivity
)

# assign column names: "subject", the feature names, then "activity"
colnames(Activity) <- c("subject", features[, 2], "activity")

## STEP 2 - only extract the mean and standard deviation for each measurement

# keep the two id columns plus every column whose name contains mean or std
retainCol <- grepl("subject|activity|mean|std", colnames(Activity))

Activity <- Activity[, retainCol]

## describe each activity

# replace the numeric activity ids with their labels from the lookup table
Activity$activity <- factor(
  Activity$activity,
  levels = activities[, 1],
  labels = activities[, 2]
)

## STEP 3 - label the data set with descriptive variable names

# column names
ActivityCols <- colnames(Activity)

# expand abbreviations and clean up names
ActivityCols <- gsub("^f", "frequencyDomain", ActivityCols)
ActivityCols <- gsub("^t", "timeDomain", ActivityCols)
ActivityCols <- gsub("Acc", "Accelerometer", ActivityCols)
ActivityCols <- gsub("Gyro", "Gyroscope", ActivityCols)
ActivityCols <- gsub("Mag", "Magnitude", ActivityCols)
ActivityCols <- gsub("Freq", "Frequency", ActivityCols)
ActivityCols <- gsub("mean", "Mean", ActivityCols)
ActivityCols <- gsub("std", "StandardDeviation", ActivityCols)

# write the cleaned names back; without this the renaming has no effect
colnames(Activity) <- ActivityCols

## STEP 4 - create a second, independent tidy set with the average of each
## variable for each activity and each subject

# across(everything(), ...) covers every non-grouping column
ActivityMeans <- Activity %>%
  group_by(subject, activity) %>%
  summarise(across(everything(), mean), .groups = "drop")

## write the result to the text file "tidy_data.txt"
write.table(ActivityMeans, "tidy_data.txt", row.names = FALSE, quote = FALSE)

About

No description, website, or topics provided.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages