前言

最近在学习深度学习,就用DNN试着跑了天池赛的二手车价格预测。特征仍沿用之前集成模型所用的特征,通过不断调试模型的学习率、隐藏层数量、神经元数量、优化器、激活函数、迭代次数、batch size和KFold,最终达到了与之前集成模型差不多的分数,而且训练时间比catboost及lightgbm短很多,毕竟只用很少的迭代次数就能达到差不多的效果。接着再与集成模型进行融合,将之前的成绩从422提高到406,算是一次DNN的练习吧,毕竟模型的上限还是取决于特征工程。下面附上DNN完整代码,经过交叉验证取平均,线上可以达到428左右,需要的朋友自取。

import pandas as pd

import numpy as np

import Meancoder

from datetime import datetime

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import KFold

import torch

import torch.nn as nn

from torch.autograd import Variable

# Load the training and test tables; both files are space-separated.
df, test = (
    pd.read_csv(csv_path, sep=' ')
    for csv_path in ('/train.csv', '/test.csv')
)

def date_process(x):
    """Parse a ``YYYYMMDD``-style value into a ``datetime``.

    The value is converted to a string and sliced by position: characters
    0-3 are the year, 4-5 the month and 6-7 the day.  A month below 1
    (for example ``00``) is replaced with January so that ``datetime``
    accepts it; the year and day are passed through unchanged.

    Args:
        x: Integer or string whose string form starts with eight digits.

    Returns:
        datetime: The parsed date at midnight.

    Raises:
        ValueError: If a slice is not numeric, or the resulting
            year/month/day combination is not a valid calendar date.
    """
    text = str(x)
    year = int(text[:4])
    # Clamp a zero month to January instead of letting datetime() raise.
    month = max(int(text[4:6]), 1)
    day = int(text[6:8])
    return datetime(year, month, day)

def _prepare_frame(frame):
    """Clean one raw table and add its date-derived columns, in place.

    The same steps are applied to the training and the test table:

    * parse ``regDate`` / ``creatDate`` with ``date_process`` and expand each
      into ``<col>_year``, ``<col>_month`` and ``<col>_day``;
    * add ``car_age_day`` (days between the two dates) and ``car_age_year``
      (``car_age_day / 365`` rounded to one decimal);
    * replace the ``'-'`` placeholder in ``notRepairedDamage`` with 0 and
      cast the column to ``float64``;
    * cap ``power`` to the range [1, 600], ``v_13`` at 6 and ``v_14`` at 4;
    * fill missing ``fuelType``, ``gearbox``, ``bodyType`` and ``model``
      values with 0.

    Args:
        frame: DataFrame holding the raw columns named above.

    Returns:
        The same DataFrame object, modified in place.
    """
    for col in ('regDate', 'creatDate'):
        frame[col] = frame[col].apply(date_process)
        frame[col + '_year'] = frame[col].dt.year
        frame[col + '_month'] = frame[col].dt.month
        frame[col + '_day'] = frame[col].dt.day

    frame['car_age_day'] = (frame['creatDate'] - frame['regDate']).dt.days
    frame['car_age_year'] = (frame['car_age_day'] / 365).round(1)

    frame['notRepairedDamage'] = (
        frame['notRepairedDamage'].replace('-', 0.0).astype('float64')
    )

    # Column-level clip instead of chained indexing (frame[c][mask] = v):
    # chained assignment may write to a temporary copy and is silently
    # ignored when pandas Copy-on-Write is active.
    frame['power'] = frame['power'].clip(lower=1, upper=600)
    frame['v_13'] = frame['v_13'].clip(upper=6)
    frame['v_14'] = frame['v_14'].clip(upper=4)

    for col in ('fuelType', 'gearbox', 'bodyType', 'model'):
        frame[col] = frame[col].fillna(0)

    return frame


df = _prepare_frame(df)
test = _prepare_frame(test)

# Pairwise interaction features between selected anonymous ``v_*`` columns.
# For every unordered pair (a, b) taken in list order, three columns are
# added to both tables: 'new<a>*<b>', 'new<a>+<b>' and 'new<a>-<b>'.
num_cols = [0, 2, 3, 6, 8, 10, 12, 14]
for index, value in enumerate(num_cols):
    for j in num_cols[index + 1:]:
        pair = str(value) + '{}' + str(j)
        for frame in (df, test):
            left = frame['v_' + str(value)]
            right = frame['v_' + str(j)]
            frame['new' + pair.format('*')] = left * right
            frame['new' + pair.format('+')] = left + right
            frame['new' + pair.format('-')] = left - right

# Each of v_0 .. v_14 multiplied by the car age in years.
for i in range(15):
    for frame in (df, test):
        frame['new' + str(i) + '*year'] = frame['v_' + str(i)] * frame['car_age_year']

# Extra pairwise differences for a second group of ``v_*`` columns.
num_cols1 = [3, 5, 1, 11]
for index, value in enumerate(num_cols1):
    for j in num_cols1[index + 1:]:
        for frame in (df, test):
            frame['new' + str(value) + '-' + str(j)] = (
                frame['v_' + str(value)] - frame['v_' + str(j)]
            )

# Columns removed from both tables before modelling; 'price' is the target
# and is removed from the training features only.
drop_cols = ['SaleID', 'seller', 'offerType', 'name', 'creatDate', 'regionCode', 'regDate']

X = df.drop(columns=['price'] + drop_cols)
test = test.drop(columns=drop_cols)
Y = df['price']

# Mean-encode the listed columns; the encoder is fitted on the training
# features/target and then applied to the test features.
class_list = ['model', 'brand', 'power', 'v_0', 'v_3', 'v_8', 'v_12']
MeanEnocodeFeature = class_list
ME = Meancoder.MeanEncoder(MeanEnocodeFeature, target_type='regression')
X = ME.fit_transform(X, Y)
test = ME.transform(test)

# Standardise train and test together, then split them apart again at the
# real number of training rows instead of a hard-coded 150000.
# NOTE(review): the scaler statistics include the test rows — confirm this
# is intended.
n_train = len(X)
df_concat = pd.concat([X, test], ignore_index=True)
df_concat = StandardScaler().fit_transform(df_concat)
X1 = df_concat[:n_train]
test1 = df_concat[n_train:]

# Model settings
# The input width follows the feature matrix, so adding or removing a
# feature no longer requires editing a hard-coded 143 here.
input_size = X1.shape[1]
hidden_size = 320
num_classes = 1  # single regression output
batch_size = 2048
learning_rate = 0.05

# Convert features and target to float32 tensors.  The target is reshaped
# to a column so that it matches the network output of shape (N, 1).
# torch.autograd.Variable is deprecated; plain tensors are used directly.
x = torch.tensor(X1, dtype=torch.float32)
y = torch.tensor(Y.to_numpy(), dtype=torch.float32).view(-1, 1)
test = torch.tensor(test1, dtype=torch.float32)

class Net(nn.Module):
    """Fully connected network with one hidden ReLU layer.

    Layout: ``Linear(input_size, hidden_size) -> ReLU ->
    Linear(hidden_size, num_classes)``.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Number of units in the hidden layer.
        num_classes: Number of output values per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the network output for a batch ``x``.

        Args:
            x: Tensor whose last dimension equals ``input_size``.

        Returns:
            Tensor whose last dimension equals ``num_classes``.
        """
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

# Build one instance up front only to print the architecture; every fold
# below trains its own freshly initialised copy.
net = Net(input_size, hidden_size, num_classes)
print(net)

criterion = nn.L1Loss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

result = []      # one test-set prediction tensor per fold
mean_score = 0   # running average of the per-fold validation loss
n_folds = 5
n_epochs = 2000
kf = KFold(n_splits=n_folds, shuffle=True, random_state=2022)

for train_index, test_index in kf.split(x):
    x_train, y_train = x[train_index], y[train_index]
    x_test, y_test = x[test_index], y[test_index]

    # Re-create the model and optimizer for every fold.  Reusing a single
    # network across folds means later folds are validated on rows the
    # network was already trained on, which makes the validation loss
    # optimistic and the fold models non-independent.
    net = Net(input_size, hidden_size, num_classes)
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

    for i in range(n_epochs):
        for start in range(0, len(x_train), batch_size):
            # Slicing past the end is safe; the last batch is just shorter.
            xx = x_train[start:start + batch_size]
            yy = y_train[start:start + batch_size]

            outputs = net(xx)
            loss = criterion(outputs, yy)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # No gradients are needed for validation or test inference; skipping
    # graph construction also keeps the stored predictions lightweight.
    with torch.no_grad():
        y_pred = net(x_test)
        loss1 = criterion(y_pred, y_test)
        test_pred = net(test)

    mean_score += loss1.item() / n_folds
    print('验证集loss:{}'.format(loss1.item()))
    result.append(test_pred)

# Model evaluation
# The printed value is the mean L1 validation loss over the folds (the
# label text says "Auc", but the criterion above is nn.L1Loss).
print('mean 验证集Auc:{}'.format(mean_score))

# Average the per-fold test predictions and write them out.
cat_pre = sum(result) / n_folds
cat_pre = cat_pre.detach().numpy()
ret = pd.DataFrame(cat_pre, columns=['price'])
ret.to_csv('/DNN.csv')

参考文章

评论可见,请评论后查看内容,谢谢!!!
 您阅读本篇文章共花了: