Machine Learning in Random Forest in R

  Demo in Titanic Data

Posted by Haby on June 12, 2017

Feature Engineering and Analysis

import library

library(ggplot2)
library(randomForest)
library(caret)

Kaggle Titanic data

test <- read.csv(file.choose(),header = T)
train <- read.csv(file.choose(),header = T)

test every variable

attach(train)
summary(Pclass)
train$Pclass <-as.factor(Pclass)

plot

ggplot(train,aes(Pclass,fill = as.factor(Survived)))+geom_bar()
Pclass has a strong relation with Survived

summary(Sex)
train$Sex <- as.factor(Sex)
ggplot(train,aes(Sex,fill = as.factor(Survived)))+geom_bar()
Sex has a strong relation with Survived

ggplot(train,aes(Pclass,fill = as.factor(Survived)))+
    facet_grid(. ~ Sex)+
    geom_bar()

ggplot(train,aes(Sex,fill = as.factor(Survived)))+
    facet_grid(. ~ Pclass)+
    geom_bar()
Sex & Pclass has a strong relation with sruvived rate

check next variabe

summary(Age)
too many NAs in age variable,fillna later

summary(SibSp)
boxplot(SibSp)

someone shows up with a large spouse numbers?    
outliers.sibsp <- train[which(SibSp > 2),]

check to see if they owned same ticket number

length(unique(Ticket)) == length(nrow(train))
False, means some people owned same tickets

plot sibsp vs survived

ggplot(train,aes(SibSp,fill = as.factor(Survived)))+
    geom_bar()
see that more sibsp, more chance to die, if sibsp more than 5,all died

combined 3 variable together

ggplot(train,aes(SibSp,fill = as.factor(Survived)))+
    facet_grid(. ~ Pclass+ Sex)+
    geom_bar()

hold sibsp, check parch

summary(Parch)
boxplot(Parch)
ggplot(train,aes(Parch,fill = as.factor(Survived)))+
    geom_bar()

make a new variable family.size

family.size <- Parch + SibSp +1
summary(family.size)
boxplot(family.size)

check family.size vs survived

ggplot(train,aes(family.size,fill = as.factor(Survived)))+
    geom_bar()

see that family.size >= 8,no ppl survived

chech family.size >= 8
train[family.size >= 8, ]
if family.size >=8, no survivers, even females

check test if theres family size >= 8

test[which(test$Parch+test$SibSp+1 >= 8),]

family.size is a good variable

check next variable ticket, extract capital

ticket.level <- as.factor(substr(Ticket,1,1))

check ticket.level vs survived

ggplot(train,aes(ticket.level,fill = as.factor(Survived)))+
    facet_grid(. ~ Pclass + Sex)+
    geom_bar()
No significant influence, pass it

check fare

summary(Fare)
boxplot(Fare)
some outlier here

pick up guys with high fare

outliers.fare <- train[which(Fare > 180),]

check their Pclass

summary(outliers.fare$Pclass)
all 3 classes have high fare

check fare vs survive rate

ggplot(train,aes(sort(Fare),fill = as.factor(Survived)))+
    geom_density(alpha = 0.2)

table(outliers.fare$Survived)
no significant influence

check embarked

table(Embarked)
ggplot(train,aes(Embarked,fill = as.factor(Survived)))+
    facet_grid(. ~ Pclass + Sex)+
    geom_bar()
embarked seems ok

Modeling

Random Forest

set up all data
test.survived <- cbind(PassengerId=test[,1],
                       Survived = rep("None",nrow(test)),test[,-1])
data.combined <- rbind(train,test.survived)
data.combined$family.size <- as.factor(data.combined$Parch + data.combined$SibSp+ 1)

check label
rf.label <-as.factor(train$Survived)

Select different columns

log 1
rf.train.1 <- data.combined[ 1:891,c("Pclass","Embarked","family.size")]
rf.1 <- randomForest(x = rf.train.1 , y = rf.label,ntree = 1000)
rf.1

log 2
rf.train.2 <- data.combined[ 1:891,c("Pclass","family.size")]
rf.2 <- randomForest(x = rf.train.2 , y = rf.label,ntree = 1000)
rf.2

log 3
rf.train.3 <- data.combined[ 1:891,c("Pclass","family.size","Embarked","Sex")]
rf.3 <- randomForest(x = rf.train.3 , y = rf.label,ntree = 1000)
rf.3

log 4
rf.train.4 <- data.combined[ 1:891,c("Pclass","family.size","Sex")]
rf.4 <- randomForest(x = rf.train.4 , y = rf.label,ntree = 1000)
rf.4

Log 4 <- rf.4 is the best model for thess all tests
set up predict model
rf.results.1<-predict(rf.4,data.combined[892:1309,c("Pclass","family.size","Sex")])

Kaggle score is 0.77512, which model score is 0.8081

try set better model

retry all variables

detach(train)
attach(data.combined)
FAMILY.SIZE is not ATTACH

check Pclass

ggplot(train,aes(Pclass,fill = as.factor(Survived)))+
    geom_bar()

check family.size

ggplot(train,aes(family.size,fill = as.factor(Survived)))+
    geom_bar()

check details from name

name.spilt <- strsplit(as.character(Name),",")
first.name <-sapply(name.spilt,"[",1)
other.name <-sapply(name.spilt,"[",2)
other.name.spilt <- strsplit(other.name,". ")
title <- sapply(other.name.spilt,"[",1)
data.combined$title <- as.factor(title)

check if title is a good variable

ggplot(data.combined[1:891,],aes(title,fill = as.factor(Survived)))+
    geom_bar()

show something interest
check male officer title

capt <- data.combined[which(data.combined$title == " Capt"),]
col <- data.combined[which(data.combined$title == " Col"),]
don <- data.combined[which(data.combined$title == " Don"),]
dr <- data.combined[which(data.combined$title == " Dr"),]
major <- data.combined[which(data.combined$title == " Major"),]
jonkeer <- data.combined[which(data.combined$title == " Jonkheer"),]

check master

master <- data.combined[which(data.combined$title == " Master"),]
all masters are male with age <= 14.5, which i think should have more survived rate

table(master$Survived)
23/40 more than 50% survived rate

check mlle

mlle <- data.combined[which(data.combined$title == " Mlle"),]
2 mlle, female, both survived

check mme

mme <- data.combined[which(data.combined$title == " Mme"),]
1 mme, female, survived

check ms

ms <- data.combined[which(data.combined$title == " Ms"),]
1 mme, female, survived

check sir

sir <- data.combined[which(data.combined$title == " Sir"),]
1 sir, male, survived

check th

th <- data.combined[which(data.combined$title == " th"),]
1 th, female survived

makeup new variable new.title

data.combined$new.title <- rep("NA",1309)
data.combined$new.title[which(data.combined$title %in% c(" Capt"," Col",
                                                         " Don"," Dr"," Jonkheer"," Major"," Rev"))]<-"Officer"
data.combined$new.title[which(data.combined$title %in% c(" Lady"," Miss"," Mlle"," Dona",
                                                         " Mme"," Mrs"," Ms"," th"))]<- "Mrs"
data.combined$new.title[which(data.combined$title == c(" Master"))] <-"Master"
data.combined$new.title[which(data.combined$title %in% c(" Sir"," Mr"))] <- "Mr"
unique(data.combined$new.title)

chech new.title vs survived rate

ggplot(data.combined[1:891,],aes(new.title[1:891],fill = Survived))+
    facet_grid(. ~ Pclass + family.size)+
    geom_bar()
looks not bad

set up new random forest

data.combined$new.title <- as.factor(data.combined$new.title)
rf.train.5 <- data.combined[1:891,c("Pclass","family.size","new.title")]
rf.5 <- randomForest(x= rf.train.5,y = rf.label,ntree = 1000)
varImpPlot(rf.5)

perdict data and submit : rf5

rf.results.5 <- predict(rf.5,data.combined[892:1309,c("Pclass","family.size","new.title")])

write.csv(data.frame(PassengerId = c(892:1309), Survived = rf.results.5),
          file = "Kaggle_2016_9_9_sub_2.csv", row.names = F)

83.39% correct rate, not bad
score = 0.79426, a little better

try something more

think about age, pass it first time since so many NAs
summary(data.combined$Age)

wanna see if age has relation with survival rate

non.na.age <- train[which(train$Age != "NA"),]
ggplot(non.na.age,aes(as.factor(Age >= 53.5),fill = as.factor(Survived)))+
    facet_grid(. ~ Sex)+
    geom_bar()
ggplot(non.na.age,aes(as.factor(Age <= 14.5),fill = as.factor(Survived)))+
    facet_grid(. ~ Sex)+
    geom_bar()
ggplot(non.na.age,aes(as.factor(Age),fill = as.factor(Survived)))+
    facet_grid(. ~ Sex)+
    geom_bar()

rf6

rf.train.6 <- data.combined[1:891,c("Pclass","family.size","new.title","new.fare")]
rf.6 <- randomForest(x= rf.train.6,y = rf.label,ntree = 1000)
varImpPlot(rf.6)

83.73% rf7

rf.train.7 <- data.combined[1:891,c("Pclass","family.size","new.title","Embarked")]
rf.7 <- randomForest(x= rf.train.7,y = rf.label,ntree = 1000)
varImpPlot(rf.7)

81.71%, not good, delete embarked rf8

rf.train.8 <- data.combined[1:891,c("Pclass","family.size","new.title","Sex")]
rf.8 <- randomForest(x= rf.train.8,y = rf.label,ntree = 1000)
varImpPlot(rf.8)

83.5%

try rf.6, but fare has a NA
data.combined$new.fare <- data.combined$Fare
data.combined[which(is.na(data.combined$new.fare)),"new.fare"] <- "7.25"

submit rf.6

rf.results.6 <- predict(rf.6,data.combined[892:1309,c("Pclass","family.size","new.title","new.fare")])

write.csv(data.frame(PassengerId = c(892:1309), Survived = rf.results.6),
          file = "Kaggle_2016_9_10_sub_2.csv", row.names = F)

score is 0.78947

ggplot(data.combined[1:891,],aes(as.factor(new.title),fill = as.factor(Survived)))+
    facet_grid(. ~ Pclass + family.size)+
    geom_bar()

set up new variable new.age

data.combined$new.age <- data.combined$Age

assign median age to missing variable

med.dr <- median(data.combined$Age[which(data.combined$title == " Dr" & data.combined$Age != "NA")])
data.combined$new.age[which(data.combined$title == " Dr" & is.na(data.combined$Age))] <-med.dr

mas.dr <- median(data.combined$Age[which(data.combined$title == " Master" & data.combined$Age != "NA")])
data.combined$new.age[which(data.combined$title == " Master" & is.na(data.combined$Age))] <-mas.dr

med.mis <- median(data.combined$Age[which(data.combined$title == " Miss" & data.combined$Age != "NA")])
data.combined$new.age[which(data.combined$title == " Miss" & is.na(data.combined$Age))] <-med.mis

med.mr <- median(data.combined$Age[which(data.combined$title == " Mr" & data.combined$Age != "NA")])
data.combined$new.age[which(data.combined$title == " Mr" & is.na(data.combined$Age))] <-med.mr

med.mrs <- median(data.combined$Age[which(data.combined$title == " Mrs" & data.combined$Age != "NA")])
data.combined$new.age[which(data.combined$title == " Mrs" & is.na(data.combined$Age))] <-med.mrs

med.ms <- median(data.combined$Age[which(data.combined$title == " Ms" & data.combined$Age != "NA")])
data.combined$new.age[which(data.combined$title == " Ms" & is.na(data.combined$Age))] <-med.ms

try pclass family.size new.title and new.age

rf10

rf.train.10 <- data.combined[1:891,c("Pclass","family.size","new.title","new.age")]
rf.10 <- randomForest(x= rf.train.10,y = rf.label,ntree = 1000)
varImpPlot(rf.10)

rf.results.10 <- predict(rf.10,data.combined[892:1309,c("Pclass","family.size","new.title","new.age")])

write.csv(data.frame(PassengerId = c(892:1309), Survived = rf.results.10),
          file = "Kaggle_2016_9_10_sub_3.csv", row.names = F)

0.76077 rf11

rf.train.11 <- data.combined[1:891,c("Pclass","new.fare","new.title","new.age")]
rf.11 <- randomForest(x= rf.train.11,y = rf.label,importance = T,ntree = 1000)
varImpPlot(rf.11)

rf.results.11 <- predict(rf.11,data.combined[892:1309,c("Pclass","new.fare","new.title","new.age")])

write.csv(data.frame(PassengerId = c(892:1309), Survived = rf.results.11),
          file = "Kaggle_2016_9_10_sub_4.csv", row.names = F)

0.77033

remap age

data.combined$new.age <- "NA"
data.combined$new.age <- data.combined$Age
none.na.age <- data.combined[which(!is.na(data.combined$new.age)),]


new.age.label <- cut(none.na.age$new.age,breaks = c(0.25,2.5,6,10,14,17,28,40,48,55,65,72,84),
                     labels = c("A","B","C","D","E","F","G","H","I","J","K","L"))

data.combined$factor.age <- ifelse(!is.na(data.combined$new.age),new.age.label,"NA")
factor.age.rf.label <-(none.na.age$new.age)

rf.train.age <- none.na.age[,c("SibSp","Parch","new.title","family.size")]

rf.train <- randomForest(x=rf.train.age,y = factor.age.rf.label,ntree = 1000)

write csv and submit

write.csv(data.frame(PassengerId = c(892:1309),Survived = rf.results.6),
          file = "Kaggle_2016_9_9_sub_1.csv",row.names = F)