Feature Engineering and Analysis
import library
library(ggplot2)
library(randomForest)
library(caret)
Kaggle Titanic data
test <- read.csv(file.choose(),header = T)
train <- read.csv(file.choose(),header = T)
test every variable
attach(train)
summary(Pclass)
train$Pclass <-as.factor(Pclass)
plot
ggplot(train,aes(Pclass,fill = as.factor(Survived)))+geom_bar()
Pclass has a strong relation with Survived
summary(Sex)
train$Sex <- as.factor(Sex)
ggplot(train,aes(Sex,fill = as.factor(Survived)))+geom_bar()
Sex has a strong relation with Survived
ggplot(train,aes(Pclass,fill = as.factor(Survived)))+
facet_grid(. ~ Sex)+
geom_bar()
ggplot(train,aes(Sex,fill = as.factor(Survived)))+
facet_grid(. ~ Pclass)+
geom_bar()
Sex & Pclass has a strong relation with sruvived rate
check next variabe
summary(Age)
too many NAs in age variable,fillna later
summary(SibSp)
boxplot(SibSp)
someone shows up with a large spouse numbers?
outliers.sibsp <- train[which(SibSp > 2),]
check to see if they owned same ticket number
length(unique(Ticket)) == length(nrow(train))
False, means some people owned same tickets
plot sibsp vs survived
ggplot(train,aes(SibSp,fill = as.factor(Survived)))+
geom_bar()
see that more sibsp, more chance to die, if sibsp more than 5,all died
combined 3 variable together
ggplot(train,aes(SibSp,fill = as.factor(Survived)))+
facet_grid(. ~ Pclass+ Sex)+
geom_bar()
hold sibsp, check parch
summary(Parch)
boxplot(Parch)
ggplot(train,aes(Parch,fill = as.factor(Survived)))+
geom_bar()
make a new variable family.size
family.size <- Parch + SibSp +1
summary(family.size)
boxplot(family.size)
check family.size vs survived
ggplot(train,aes(family.size,fill = as.factor(Survived)))+
geom_bar()
see that family.size >= 8,no ppl survived
chech family.size >= 8
train[family.size >= 8, ]
if family.size >=8, no survivers, even females
check test if theres family size >= 8
test[which(test$Parch+test$SibSp+1 >= 8),]
family.size is a good variable
check next variable ticket, extract capital
ticket.level <- as.factor(substr(Ticket,1,1))
check ticket.level vs survived
ggplot(train,aes(ticket.level,fill = as.factor(Survived)))+
facet_grid(. ~ Pclass + Sex)+
geom_bar()
No significant influence, pass it
check fare
summary(Fare)
boxplot(Fare)
some outlier here
pick up guys with high fare
outliers.fare <- train[which(Fare > 180),]
check their Pclass
summary(outliers.fare$Pclass)
all 3 classes have high fare
check fare vs survive rate
ggplot(train,aes(sort(Fare),fill = as.factor(Survived)))+
geom_density(alpha = 0.2)
table(outliers.fare$Survived)
no significant influence
check embarked
table(Embarked)
ggplot(train,aes(Embarked,fill = as.factor(Survived)))+
facet_grid(. ~ Pclass + Sex)+
geom_bar()
embarked seems ok
Modeling
Random Forest
set up all data
test.survived <- cbind(PassengerId=test[,1],
Survived = rep("None",nrow(test)),test[,-1])
data.combined <- rbind(train,test.survived)
data.combined$family.size <- as.factor(data.combined$Parch + data.combined$SibSp+ 1)
check label
rf.label <-as.factor(train$Survived)
Select different columns
log 1
rf.train.1 <- data.combined[ 1:891,c("Pclass","Embarked","family.size")]
rf.1 <- randomForest(x = rf.train.1 , y = rf.label,ntree = 1000)
rf.1
log 2
rf.train.2 <- data.combined[ 1:891,c("Pclass","family.size")]
rf.2 <- randomForest(x = rf.train.2 , y = rf.label,ntree = 1000)
rf.2
log 3
rf.train.3 <- data.combined[ 1:891,c("Pclass","family.size","Embarked","Sex")]
rf.3 <- randomForest(x = rf.train.3 , y = rf.label,ntree = 1000)
rf.3
log 4
rf.train.4 <- data.combined[ 1:891,c("Pclass","family.size","Sex")]
rf.4 <- randomForest(x = rf.train.4 , y = rf.label,ntree = 1000)
rf.4
Log 4 <- rf.4 is the best model for thess all tests
set up predict model
rf.results.1<-predict(rf.4,data.combined[892:1309,c("Pclass","family.size","Sex")])
Kaggle score is 0.77512, which model score is 0.8081
try set better model
retry all variables
detach(train)
attach(data.combined)
FAMILY.SIZE is not ATTACH
check Pclass
ggplot(train,aes(Pclass,fill = as.factor(Survived)))+
geom_bar()
check family.size
ggplot(train,aes(family.size,fill = as.factor(Survived)))+
geom_bar()
check details from name
name.spilt <- strsplit(as.character(Name),",")
first.name <-sapply(name.spilt,"[",1)
other.name <-sapply(name.spilt,"[",2)
other.name.spilt <- strsplit(other.name,". ")
title <- sapply(other.name.spilt,"[",1)
data.combined$title <- as.factor(title)
check if title is a good variable
ggplot(data.combined[1:891,],aes(title,fill = as.factor(Survived)))+
geom_bar()
show something interest
check male officer title
capt <- data.combined[which(data.combined$title == " Capt"),]
col <- data.combined[which(data.combined$title == " Col"),]
don <- data.combined[which(data.combined$title == " Don"),]
dr <- data.combined[which(data.combined$title == " Dr"),]
major <- data.combined[which(data.combined$title == " Major"),]
jonkeer <- data.combined[which(data.combined$title == " Jonkheer"),]
check master
master <- data.combined[which(data.combined$title == " Master"),]
all masters are male with age <= 14.5, which i think should have more survived rate
table(master$Survived)
23/40 more than 50% survived rate
check mlle
mlle <- data.combined[which(data.combined$title == " Mlle"),]
2 mlle, female, both survived
check mme
mme <- data.combined[which(data.combined$title == " Mme"),]
1 mme, female, survived
check ms
ms <- data.combined[which(data.combined$title == " Ms"),]
1 mme, female, survived
check sir
sir <- data.combined[which(data.combined$title == " Sir"),]
1 sir, male, survived
check th
th <- data.combined[which(data.combined$title == " th"),]
1 th, female survived
makeup new variable new.title
data.combined$new.title <- rep("NA",1309)
data.combined$new.title[which(data.combined$title %in% c(" Capt"," Col",
" Don"," Dr"," Jonkheer"," Major"," Rev"))]<-"Officer"
data.combined$new.title[which(data.combined$title %in% c(" Lady"," Miss"," Mlle"," Dona",
" Mme"," Mrs"," Ms"," th"))]<- "Mrs"
data.combined$new.title[which(data.combined$title == c(" Master"))] <-"Master"
data.combined$new.title[which(data.combined$title %in% c(" Sir"," Mr"))] <- "Mr"
unique(data.combined$new.title)
chech new.title vs survived rate
ggplot(data.combined[1:891,],aes(new.title[1:891],fill = Survived))+
facet_grid(. ~ Pclass + family.size)+
geom_bar()
looks not bad
set up new random forest
data.combined$new.title <- as.factor(data.combined$new.title)
rf.train.5 <- data.combined[1:891,c("Pclass","family.size","new.title")]
rf.5 <- randomForest(x= rf.train.5,y = rf.label,ntree = 1000)
varImpPlot(rf.5)
perdict data and submit : rf5
rf.results.5 <- predict(rf.5,data.combined[892:1309,c("Pclass","family.size","new.title")])
write.csv(data.frame(PassengerId = c(892:1309), Survived = rf.results.5),
file = "Kaggle_2016_9_9_sub_2.csv", row.names = F)
83.39% correct rate, not bad
score = 0.79426, a little better
try something more
think about age, pass it first time since so many NAs
summary(data.combined$Age)
wanna see if age has relation with survival rate
non.na.age <- train[which(train$Age != "NA"),]
ggplot(non.na.age,aes(as.factor(Age >= 53.5),fill = as.factor(Survived)))+
facet_grid(. ~ Sex)+
geom_bar()
ggplot(non.na.age,aes(as.factor(Age <= 14.5),fill = as.factor(Survived)))+
facet_grid(. ~ Sex)+
geom_bar()
ggplot(non.na.age,aes(as.factor(Age),fill = as.factor(Survived)))+
facet_grid(. ~ Sex)+
geom_bar()
rf6
rf.train.6 <- data.combined[1:891,c("Pclass","family.size","new.title","new.fare")]
rf.6 <- randomForest(x= rf.train.6,y = rf.label,ntree = 1000)
varImpPlot(rf.6)
83.73% rf7
rf.train.7 <- data.combined[1:891,c("Pclass","family.size","new.title","Embarked")]
rf.7 <- randomForest(x= rf.train.7,y = rf.label,ntree = 1000)
varImpPlot(rf.7)
81.71%, not good, delete embarked rf8
rf.train.8 <- data.combined[1:891,c("Pclass","family.size","new.title","Sex")]
rf.8 <- randomForest(x= rf.train.8,y = rf.label,ntree = 1000)
varImpPlot(rf.8)
83.5%
try rf.6, but fare has a NA
data.combined$new.fare <- data.combined$Fare
data.combined[which(is.na(data.combined$new.fare)),"new.fare"] <- "7.25"
submit rf.6
rf.results.6 <- predict(rf.6,data.combined[892:1309,c("Pclass","family.size","new.title","new.fare")])
write.csv(data.frame(PassengerId = c(892:1309), Survived = rf.results.6),
file = "Kaggle_2016_9_10_sub_2.csv", row.names = F)
score is 0.78947
ggplot(data.combined[1:891,],aes(as.factor(new.title),fill = as.factor(Survived)))+
facet_grid(. ~ Pclass + family.size)+
geom_bar()
set up new variable new.age
data.combined$new.age <- data.combined$Age
assign median age to missing variable
med.dr <- median(data.combined$Age[which(data.combined$title == " Dr" & data.combined$Age != "NA")])
data.combined$new.age[which(data.combined$title == " Dr" & is.na(data.combined$Age))] <-med.dr
mas.dr <- median(data.combined$Age[which(data.combined$title == " Master" & data.combined$Age != "NA")])
data.combined$new.age[which(data.combined$title == " Master" & is.na(data.combined$Age))] <-mas.dr
med.mis <- median(data.combined$Age[which(data.combined$title == " Miss" & data.combined$Age != "NA")])
data.combined$new.age[which(data.combined$title == " Miss" & is.na(data.combined$Age))] <-med.mis
med.mr <- median(data.combined$Age[which(data.combined$title == " Mr" & data.combined$Age != "NA")])
data.combined$new.age[which(data.combined$title == " Mr" & is.na(data.combined$Age))] <-med.mr
med.mrs <- median(data.combined$Age[which(data.combined$title == " Mrs" & data.combined$Age != "NA")])
data.combined$new.age[which(data.combined$title == " Mrs" & is.na(data.combined$Age))] <-med.mrs
med.ms <- median(data.combined$Age[which(data.combined$title == " Ms" & data.combined$Age != "NA")])
data.combined$new.age[which(data.combined$title == " Ms" & is.na(data.combined$Age))] <-med.ms
try pclass family.size new.title and new.age
rf10
rf.train.10 <- data.combined[1:891,c("Pclass","family.size","new.title","new.age")]
rf.10 <- randomForest(x= rf.train.10,y = rf.label,ntree = 1000)
varImpPlot(rf.10)
rf.results.10 <- predict(rf.10,data.combined[892:1309,c("Pclass","family.size","new.title","new.age")])
write.csv(data.frame(PassengerId = c(892:1309), Survived = rf.results.10),
file = "Kaggle_2016_9_10_sub_3.csv", row.names = F)
0.76077 rf11
rf.train.11 <- data.combined[1:891,c("Pclass","new.fare","new.title","new.age")]
rf.11 <- randomForest(x= rf.train.11,y = rf.label,importance = T,ntree = 1000)
varImpPlot(rf.11)
rf.results.11 <- predict(rf.11,data.combined[892:1309,c("Pclass","new.fare","new.title","new.age")])
write.csv(data.frame(PassengerId = c(892:1309), Survived = rf.results.11),
file = "Kaggle_2016_9_10_sub_4.csv", row.names = F)
0.77033
remap age
data.combined$new.age <- "NA"
data.combined$new.age <- data.combined$Age
none.na.age <- data.combined[which(!is.na(data.combined$new.age)),]
new.age.label <- cut(none.na.age$new.age,breaks = c(0.25,2.5,6,10,14,17,28,40,48,55,65,72,84),
labels = c("A","B","C","D","E","F","G","H","I","J","K","L"))
data.combined$factor.age <- ifelse(!is.na(data.combined$new.age),new.age.label,"NA")
factor.age.rf.label <-(none.na.age$new.age)
rf.train.age <- none.na.age[,c("SibSp","Parch","new.title","family.size")]
rf.train <- randomForest(x=rf.train.age,y = factor.age.rf.label,ntree = 1000)
write csv and submit
write.csv(data.frame(PassengerId = c(892:1309),Survived = rf.results.6),
file = "Kaggle_2016_9_9_sub_1.csv",row.names = F)