Implementing a Random Forest Prediction Algorithm in Python: Predicting House Prices as an Example

Reference posts: python随机森林回归数据实战_随机森林回归参数-CSDN博客

用Python实现随机森林回归_逐步回归分析可以用随机森林拟合么-CSDN博客

Packages used. The last line adds the Graphviz install directory to PATH and should be adjusted for your own setup; a sketch of rendering a single tree with Graphviz follows the import block below.

import sklearn.datasets as datasets
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
import numpy as np
import os
import matplotlib.pyplot as plt
# Add the Graphviz binaries to PATH (Windows install location; adjust as needed)
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
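
The Graphviz PATH entry above is only needed if you want to render an individual tree from the forest as an image. A minimal sketch of how that could look, assuming the trained forest and the feature_names list from the sections below, plus the graphviz Python package (the output names tree.dot / tree are placeholders):

from sklearn.tree import export_graphviz
import graphviz

# Export the first tree of the trained forest to DOT format
export_graphviz(forest.estimators_[0],
                out_file='tree.dot',
                feature_names=feature_names[:13],
                filled=True, rounded=True)
# Render the DOT file to a PNG (needs the Graphviz binaries on PATH, as set above)
with open('tree.dot') as f:
    graphviz.Source(f.read()).render('tree', format='png')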

Model section:

# Train a random forest regressor
forest = RandomForestRegressor(
    n_estimators=3000,
    random_state=1,
    n_jobs=-1)
forest.fit(x_train,y_train)
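
Once the forest is fitted it can be scored on the held-out split produced in the data loading section. A minimal sketch, assuming x_test and y_test from that split (the full code below uses forest.score the same way for the plot title):

from sklearn.metrics import mean_squared_error

r2 = forest.score(x_test, y_test)                        # R^2 on the test split
mse = mean_squared_error(y_test, forest.predict(x_test))  # mean squared error
print(f"R^2: {r2:.3f}, MSE: {mse:.3f}")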

Data loading section:

firstdata = np.fromfile('F:/RandomForest/housing.data', sep=' ')
# Feature (column) names
feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
                     'MEDV']
# Number of columns
feature_num = len(feature_names)
# print(firstdata.shape)  # output: (7084,)
# print(firstdata.shape[0] // feature_num)  # output: 506
# Reshape the flat array into a 506 x 14 two-dimensional array
dataset = firstdata.reshape([firstdata.shape[0] // feature_num, feature_num])
X = dataset[:, 0:13]
Y = dataset[:, 13]
# Split into training and test sets (70% / 30%)
x_train,x_test,y_train,y_test = train_test_split(X, Y, test_size=0.3)
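
Since pandas is already imported, the same file can also be loaded without the manual reshape, assuming each of the 506 records sits on a single whitespace-separated line. A sketch of the alternative (not used in the full code below):

df = pd.read_csv('F:/RandomForest/housing.data', sep=r'\s+', header=None, names=feature_names)
X = df.iloc[:, 0:13].values   # the 13 input features
Y = df.iloc[:, 13].values     # the MEDV target column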

Full code:

# Check whether there are missing values
import sklearn.datasets as datasets
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
import numpy as np
import os
import matplotlib.pyplot as plt
# Add the Graphviz binaries to PATH (Windows install location; adjust as needed)
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'
# Note: I hit many "module not found" errors here because the packages in this environment
# had somehow been upgraded; uninstalling and reinstalling the packages named in the
# error messages (numpy, matplotlib, pillow) one by one fixed it.
firstdata = np.fromfile('F:/RandomForest/housing.data', sep=' ')
# Feature (column) names
feature_names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
                     'MEDV']
# Number of columns
feature_num = len(feature_names)
# print(firstdata.shape)  # output: (7084,)
# print(firstdata.shape[0] // feature_num)  # output: 506
# Reshape the flat array into a 506 x 14 two-dimensional array
dataset = firstdata.reshape([firstdata.shape[0] // feature_num, feature_num])
X = dataset[:, 0:13]
Y = dataset[:, 13]
# Split into training and test sets (70% / 30%)
x_train,x_test,y_train,y_test = train_test_split(X, Y, test_size=0.3)
# Train a random forest regressor
forest = RandomForestRegressor(
    n_estimators=3000,
    random_state=1,
    n_jobs=-1)
forest.fit(x_train,y_train)
# Load the external data to be predicted
datafile1 = 'F:/boston-house-price-forecast-master/boston-house-price-forecast-master/yuce.data'  
feature_names1 = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
data1 = np.fromfile(datafile1, sep=' ') 
feature_num1 = len(feature_names1)
d = data1.reshape([data1.shape[0] // feature_num1, feature_num1])
result1 = forest.predict(d)
# Visualize the prediction results
score = forest.score(x_test, y_test)
# Test set (uncomment to plot test-set predictions instead)
#result = forest.predict(x_test)
plt.figure()
# Test-set visualization
# plt.plot(np.arange(100), y_test[:100], "go-", label="True value")
# plt.plot(np.arange(100), result[:100], "ro-", label="Predict value")
plt.plot(np.arange(200), Y[:200], "go-", label="True value")
plt.plot(np.arange(200), result1[:200], "ro-", label="Predict value")
plt.title(f"RandomForest---score:{score}")
plt.legend(loc="best")
plt.show()
# Save the test-set targets and the external-data predictions to text files
np.savetxt('test1.txt', y_test, fmt='%s')
np.savetxt('result1.txt', result1, fmt='%s')
# Test-set predictions
y_pred = forest.predict(x_test)
# Evaluate feature importances of the trained random forest
# feature_importances_ returns the importance of each feature
importances = forest.feature_importances_
print("重要性:", importances)
# Names of the 13 input features (the target MEDV is excluded)
x_columns = feature_names[:13]
# Indices that sort the importances in descending order
indices = np.argsort(importances)[::-1]
for f in range(x_train.shape[1]):
    # The descending sort puts the most important features first; I think of it as
    # something like backtracking through the tree from the leaves toward the root,
    # where features near the root matter more than those near the leaves.
    print("%2d) %-*s %f" % (f + 1, 30, feature_names[indices[f]], importances[indices[f]]))
# Select features whose importance exceeds a threshold
threshold = 0.15
x_selected = x_train[:, importances > threshold]
# Visualization
plt.figure(figsize=(10, 6))
plt.title("importance of  feature  in dateset", fontsize=18)
plt.ylabel("import level", fontsize=15, rotation=90)
x_columns1 = [x_columns[i] for i in indices]
for i in range(len(x_columns)):
    plt.bar(i, importances[indices[i]], color='orange', align='center')
plt.xticks(np.arange(len(x_columns)), x_columns1, fontsize=10, rotation=30)
plt.show()
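
The x_selected array built above keeps only the features whose importance exceeds the threshold. A short sketch, assuming the variables from the script above, of applying the same mask to the test split and retraining to see how much accuracy the reduced feature set costs (not part of the original script):

mask = importances > threshold
x_test_selected = x_test[:, mask]
forest_small = RandomForestRegressor(n_estimators=3000, random_state=1, n_jobs=-1)
forest_small.fit(x_selected, y_train)
print("All features  R^2:", forest.score(x_test, y_test))
print("Selected only R^2:", forest_small.score(x_test_selected, y_test))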

Results: