本文实现了R语言构建随机森林模型并且进行K-折交叉验证。K-折交叉验证即将原始数据分成K等分,每次选择1份作为测试集,另外k-1份为训练集进行建模,最终精度为k次建模精度的均值。本文以十折交叉验证为例,例子不是分类是回归。如果一次结果不太满意,可以修改max_seed,代码实现了这一迭代,最后会保留最佳的训练结果。

感谢评论指出的错误,代码已经改正过了。

# ---- Configuration ----

# Seed that produced the highest mean test R2 so far (0 until a seed has been
# evaluated).
bestm <- 0

# Highest mean test R2 seen so far. Starts at -Inf rather than 0 so that the
# first evaluated seed is always recorded, even when every mean test R2 is
# zero or negative (R2 drops below 0 when the residual sum of squares exceeds
# the total sum of squares).
bestr2 <- -Inf

# Path of the raw CSV data. Keep only the predictor columns and the response
# column in the file; remove any index column.
dir <- "这里是原始数据的路径.csv"

# Largest random seed to try. With the default of 3 the cross-validation is
# run with seeds 1, 2 and 3, and the seed with the highest mean test R2 is
# kept. The script evaluates every seed and then re-runs the best one to
# produce the output, so a value of 1 does the same work twice.
max_seed <- 3

# Number of folds the data are split into.
k <- 10

# ---- Seed search: k-fold cross-validation for every seed in 1..max_seed ----
#
# Reads  : dir, k, max_seed, bestm, bestr2 (set in the configuration section).
# Updates: bestm / bestr2 with the seed whose mean test R2 is highest.
# Leaves : data, len and mysplit() defined for the code that follows.

library(randomForest)

# The file contents do not depend on the seed, so read the data once instead
# of once per seed. read.csv() draws no random numbers, so results per seed
# are unchanged.
data <- as.data.frame(read.csv(dir))

# Number of observations.
len <- nrow(data)

# Split the row indices 1..len into k disjoint test folds.
#
# k   : number of folds.
# len : number of rows to split.
#
# Returns list(one = train, two = test). Both are data frames with one column
# per fold: column i of `two` holds the row indices held out in fold i and
# column i of `one` holds all remaining indices. Every fold has
# as.integer(len / k) rows; when len is not divisible by k the leftover rows
# never appear in a test fold.
mysplit <- function(k, len) {
  # Indices that have not been assigned to a test fold yet.
  pool <- seq_len(len)

  # Size of every test fold.
  seg <- as.integer(len / k)

  # One-column placeholders; column i is filled in iteration i.
  train <- as.data.frame(matrix(nrow = (len - seg)))
  test <- as.data.frame(matrix(nrow = seg))

  for (i in seq_len(k)) {
    # Draw by position. sample(pool, seg) would treat a pool consisting of a
    # single number n as 1:n and could return an index that was already used;
    # for longer pools this expression draws exactly the same values.
    ctest <- pool[sample.int(length(pool), seg)]

    train[i] <- setdiff(seq_len(len), ctest)
    test[i] <- ctest

    # Sampling without replacement across folds.
    pool <- setdiff(pool, ctest)
  }

  list(one = train, two = test)
}

# Coefficient of determination: 1 - SS_residual / SS_total.
r2_score <- function(actual, predicted) {
  ss_residuals <- sum((actual - predicted)^2)
  ss_total <- sum((actual - mean(actual))^2)
  1 - ss_residuals / ss_total
}

for (m in seq_len(max_seed)) {
  print(paste('set seed = ',m))

  # Everything random below (fold assignment, forest fitting) follows from m.
  set.seed(m)

  split1 <- mysplit(k, len)
  train <- split1$one
  test <- split1$two

  # Running sums of the per-fold R2 values; divided by k after the fold loop.
  total_train <- 0
  total_test <- 0

  for (i in seq_len(k)) {
    # Rows of fold i.
    train_actual <- data[unlist(train[i]), ]
    test_actual <- data[unlist(test[i]), ]

    # Random forest with default settings; tune the model here as needed.
    # Replace the left-hand side with the response column name (no quotes).
    rf <- randomForest(这里写原数据里因变量的名称不要带引号 ~ ., data = train_actual)

    train_predict <- predict(rf, train_actual)
    test_predict <- predict(rf, test_actual)

    # Replace the placeholder inside [[ ]] with the quoted response name.
    r_test <- r2_score(test_actual[[这里写原数据里因变量的名称要带引号]], test_predict)
    r_train <- r2_score(train_actual[[这里写原数据里因变量的名称要带引号]], train_predict)

    # Uncomment to print the per-fold scores:
    # print(paste("第", i, "折测试集r2:", r_test, sep = "", collapse = NULL))
    # print(paste("第", i, "折训练集r2:", r_train, sep = "", collapse = NULL))

    total_train <- total_train + r_train
    total_test <- total_test + r_test
  }

  # Mean R2 over the k folds.
  total_train <- total_train / k
  total_test <- total_test / k

  print(paste('最终测试集r2:',total_test))
  print(paste('最终训练集r2:',total_train))

  # Remember this seed when it beats the best mean test R2 so far.
  if (total_test > bestr2) {
    bestr2 <- total_test
    bestm <- m
  }

  # Percentage of seeds processed so far.
  jindu <- m / (max_seed / 100)
  print(paste('当前已运行',round(jindu,1),'%'))
}

# print(paste('最高r2种子为:',bestm))

# print(paste('最高r2为:',bestr2))

# ----------------------------最后输出最高r2的csv预测结果---------------------------

# ----------------------------最后输出最高r2的csv预测结果---------------------------

# ----------------------------最后输出最高r2的csv预测结果---------------------------

# Re-seed the random number generator with the seed stored in bestm.

set.seed(bestm)

# Split the row indices 1..len into k disjoint test folds.
#
# k   : number of folds.
# len : number of rows to split.
#
# Returns list(one = train, two = test). Both are data frames with one column
# per fold: column i of `two` holds the row indices held out in fold i and
# column i of `one` holds all remaining indices. Every fold has
# as.integer(len / k) rows; when len is not divisible by k the leftover rows
# never appear in a test fold.
mysplit <- function(k, len) {
  # Indices that have not been assigned to a test fold yet.
  pool <- seq_len(len)

  # Size of every test fold.
  seg <- as.integer(len / k)

  # One-column placeholders; column i is filled in iteration i.
  train <- as.data.frame(matrix(nrow = (len - seg)))
  test <- as.data.frame(matrix(nrow = seg))

  for (i in seq_len(k)) {
    # Draw by position. sample(pool, seg) would treat a pool consisting of a
    # single number n as 1:n and could return an index that was already used;
    # for longer pools this expression draws exactly the same values.
    ctest <- pool[sample.int(length(pool), seg)]

    train[i] <- setdiff(seq_len(len), ctest)
    test[i] <- ctest

    # Sampling without replacement across folds.
    pool <- setdiff(pool, ctest)
  }

  list(one = train, two = test)
}

# ---- Final run: repeat the k-fold procedure and export the held-out data ----
#
# Uses k, len, data, bestm, bestr2 and mysplit() from earlier in the script.
# Writes one CSV with the held-out rows and one with their predictions per
# fold, plus total.csv combining all folds.

library(randomForest)

# Folder that receives the CSV files; change it to a folder that exists on
# your machine.
out_dir <- "C:/Users/Administrator/Desktop"

split1 <- mysplit(k, len)
train <- split1$one
test <- split1$two

# Per-fold tables are collected in lists and bound once after the loop
# instead of growing a data frame with rbind() on every iteration.
fold_predict <- vector("list", k)
fold_actual <- vector("list", k)

for (i in seq_len(k)) {
  # Rows of fold i.
  train_actual <- data[unlist(train[i]), ]
  test_actual <- data[unlist(test[i]), ]

  # Output files for this fold.
  path_test_actual <- paste0(out_dir, "/test_actual", i, ".csv")
  path_test_predict <- paste0(out_dir, "/test_predict", i, ".csv")

  # Random forest with default settings; tune the model here as needed.
  # Replace the left-hand side with the response column name (no quotes).
  rf <- randomForest(这里写原数据里因变量的名称不要带引号 ~ ., data = train_actual)

  # train_predict is not exported. NOTE(review): it is kept so this loop makes
  # the same sequence of calls as the seed search, in case predict() consumes
  # random numbers - confirm before removing.
  train_predict <- predict(rf, train_actual)
  test_predict <- predict(rf, test_actual)

  # Export the held-out rows and their predictions for inspection.
  write.csv(test_actual, file = path_test_actual)
  write.csv(test_predict, file = path_test_predict)

  # Read the files back so both tables carry the row names as a column.
  fold_predict[[i]] <- read.csv(path_test_predict)
  fold_actual[[i]] <- read.csv(path_test_actual)
}

total_predict <- do.call(rbind, fold_predict)
total_actual <- do.call(rbind, fold_actual)

# Join actual rows and predictions on the row-name column only. write.csv()
# stores the row names in an unnamed first column, which read.csv() names "X".
# Without `by`, merge() joins on every shared column name, so a predictor
# called "x" would clash with the prediction column "x".
total <- merge(total_actual, total_predict, by = "X")

# Held-out actual values and predictions of all k models.
write.csv(total, paste0(out_dir, "/total.csv"))

print(paste('最高r2种子为:',bestm))

print(paste('最高r2为:',bestr2))

print('done')

代码已经尽量做到开箱即用,需要修改的地方有:1)交叉验证折数 k 的值;2)原始csv数据的路径;3)因变量的名称(建模公式里不带引号,计算R2取列时要带引号,出现的每一处都要修改);4)随机种子最大值 max_seed;5)结果csv的输出路径(代码中的 C:/Users/Administrator/Desktop,需改成本机存在的文件夹)。模型的调试请自行尝试。

如果遇到问题或者提供建议,欢迎留言,也可联系QQ:1262840380

好文阅读

评论可见,请评论后查看内容,谢谢!!!
 您阅读本篇文章共花了: