# -*- coding: utf-8 -*-
"""
Created on Thu Jul 17 17:56:05 2025

@author: 梁世杰
"""

"""
1. 正文中的表2、图2、图3，附录中图1、图3
实证部分基准模型-创新水平（使用industry1数据）
"""

#导入相关程序包
import matplotlib.pyplot as plt
import numpy
import pandas
import random
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.inspection import PartialDependenceDisplay

# Load the data. Columns 0-4 are taken positionally as lp, tl, ts, inno and
# inno1; everything from column 5 onward forms the predictor matrix.
data = pandas.read_excel("industry1.xlsx")
lp, tl, ts, inno, inno1 = (data.iloc[:, pos] for pos in range(5))
# x1 keeps the columns from position 7 onward; it is used later to size and
# label the variable-importance chart.
x1 = data.iloc[:, 7:]
# Turn province ('id') and then 'year' into dummy variables.
x = pandas.get_dummies(
    pandas.get_dummies(data.iloc[:, 5:], columns=['id'], prefix=[""]),
    columns=['year'],
    prefix=[""],
)
# Seeds Python's built-in RNG; the split below is made reproducible by its
# own random_state argument.
random.seed(123)
# Hold out 30% of the rows as the test set (70% train / 30% test).
x_train, x_test, inno_train, inno_test = train_test_split(
    x, inno, test_size=0.3, random_state=1
)

### Random forest --- target: inno
# Each split considers one third of the predictor columns as candidates.
max_features = int(x_train.shape[1] / 3)
model = RandomForestRegressor(
    n_estimators=500, max_features=max_features, random_state=0
)
model.fit(x_train, inno_train)
# Goodness of fit on the training set, then on the test set.
print(model.score(x_train, inno_train))
print(model.score(x_test, inno_test))

# Font used for every Chinese axis label in this block.
label_font = {'family': 'SimSun', 'size': 12}

# Fit plot: test-set predictions against actual values, plus the y = x line.
pred = model.predict(x_test)
plt.scatter(pred, inno_test, alpha=0.6, s=15, c='k')
plt.xlabel('测试集预测值', fontdict=label_font)
plt.ylabel('测试集实际值', fontdict=label_font)
for axis in (plt.gca().xaxis, plt.gca().yaxis):
    # A fresh formatter per axis: formatter objects are bound to one axis.
    axis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
w = numpy.linspace(pred.min(), pred.max(), 100)
plt.plot(w, w, 'k-')
plt.savefig('fig1.jpg', dpi=300)
plt.close()

# Partial dependence plots: one figure per variable, drawn in black.
# Each entry is (column name in x, x-axis label, output file).
for feature, x_label, out_file in (
    ("ENG", '恩格尔系数', 'fig2.jpg'),
    ("CS", '高端消费占比', 'fig3.jpg'),
    ("TC", '消费规模', 'fig4.jpg'),
    ("INC", '收入水平', 'fig5.jpg'),
):
    display = PartialDependenceDisplay.from_estimator(model, x, features=[feature])
    for line in display.axes_[0, 0].lines:
        line.set_color('black')
    plt.xlabel(x_label, fontdict=label_font)
    plt.ylabel('部分依赖关系', fontdict=label_font)
    for axis in (plt.gca().xaxis, plt.gca().yaxis):
        axis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
    plt.savefig(out_file, dpi=300)
    plt.close()

# Variable-importance chart for the non-dummy variables (the original note
# expects 12 of them: 4 explanatory variables and 8 controls). The count is
# taken from x1 instead of being hard-coded so bars and labels always agree.
# NOTE(review): assumes the first x1.shape[1] columns of x are exactly x1's
# columns, i.e. the dummies are appended after them -- confirm against the
# workbook layout.
n_vars = x1.shape[1]
importances = model.feature_importances_[:n_vars]
order = importances.argsort()
plt.barh(range(n_vars), importances[order], color='black')
plt.yticks(numpy.arange(n_vars), x1.columns[order], fontdict={'family': 'Times New Roman', 'size': 12})
plt.xlabel('变量重要性', fontdict=label_font)
plt.gca().xaxis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
plt.savefig('fig6.jpg', dpi=300)
plt.close()

# XGBoost --- target: inno
# NOTE: this block writes the same file names (fig1.jpg-fig6.jpg) as the
# random-forest block of this section, so when both run in one pass the
# random-forest figures are overwritten. The original instruction was to
# enable only one of the two blocks at a time.
model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=500,
    max_depth=6,
    subsample=1,
    colsample_bytree=0.8,
    learning_rate=0.1,
    random_state=0,
)
model.fit(x_train, inno_train)
# Goodness of fit on the training set, then on the test set.
print(model.score(x_train, inno_train))
print(model.score(x_test, inno_test))

# Font used for every Chinese axis label in this block.
label_font = {'family': 'SimSun', 'size': 12}

# Fit plot: test-set predictions against actual values, plus the y = x line.
pred = model.predict(x_test)
plt.scatter(pred, inno_test, alpha=0.6, s=15, c='k')
plt.xlabel('测试集预测值', fontdict=label_font)
plt.ylabel('测试集实际值', fontdict=label_font)
for axis in (plt.gca().xaxis, plt.gca().yaxis):
    # A fresh formatter per axis: formatter objects are bound to one axis.
    axis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
w = numpy.linspace(pred.min(), pred.max(), 100)
plt.plot(w, w, 'k-')
plt.savefig('fig1.jpg', dpi=300)
plt.close()

# Partial dependence plots: one figure per variable, drawn in black.
# Each entry is (column name in x, x-axis label, output file).
for feature, x_label, out_file in (
    ("ENG", '恩格尔系数', 'fig2.jpg'),
    ("CS", '高端消费占比', 'fig3.jpg'),
    ("TC", '消费规模', 'fig4.jpg'),
    ("INC", '收入水平', 'fig5.jpg'),
):
    display = PartialDependenceDisplay.from_estimator(model, x, features=[feature])
    for line in display.axes_[0, 0].lines:
        line.set_color('black')
    plt.xlabel(x_label, fontdict=label_font)
    plt.ylabel('部分依赖关系', fontdict=label_font)
    for axis in (plt.gca().xaxis, plt.gca().yaxis):
        axis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
    plt.savefig(out_file, dpi=300)
    plt.close()

# Variable-importance chart for the non-dummy variables. The count is taken
# from x1 instead of a hard-coded 12 so bars and labels always agree.
# NOTE(review): assumes the first x1.shape[1] columns of x are exactly x1's
# columns, i.e. the dummies are appended after them -- confirm against the
# workbook layout.
n_vars = x1.shape[1]
importances = model.feature_importances_[:n_vars]
order = importances.argsort()
plt.barh(range(n_vars), importances[order], color='black')
plt.yticks(numpy.arange(n_vars), x1.columns[order], fontdict={'family': 'Times New Roman', 'size': 12})
plt.xlabel('变量重要性', fontdict=label_font)
plt.gca().xaxis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
plt.savefig('fig6.jpg', dpi=300)
plt.close()


"""
2. 正文中的表2、图4、图5、图6，附录中图2、图4
实证部分基准模型-产业升级（使用industry2数据）
"""

#导入相关程序包
import matplotlib.pyplot as plt
import numpy
import pandas
import random
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.inspection import PartialDependenceDisplay
from matplotlib.ticker import FuncFormatter 

# Load the data. Columns 0-2 are taken positionally as lp, tl and ts;
# everything from column 3 onward forms the predictor matrix.
data = pandas.read_excel("industry2.xlsx")
lp, tl, ts = (data.iloc[:, pos] for pos in range(3))
# x1 keeps the columns from position 5 onward; it is used later to size and
# label the variable-importance chart.
x1 = data.iloc[:, 5:]
# Turn province ('id') and then 'year' into dummy variables.
x = pandas.get_dummies(
    pandas.get_dummies(data.iloc[:, 3:], columns=['id'], prefix=[""]),
    columns=['year'],
    prefix=[""],
)
# Seeds Python's built-in RNG; the split below is made reproducible by its
# own random_state argument.
random.seed(123)
# Hold out 30% of the rows as the test set (70% train / 30% test).
x_train, x_test, lp_train, lp_test = train_test_split(
    x, lp, test_size=0.3, random_state=1
)

### Random forest --- target: lp (INNO is among the predictors)
# Each split considers one third of the predictor columns as candidates.
max_features = int(x_train.shape[1] / 3)
model = RandomForestRegressor(
    n_estimators=500, max_features=max_features, random_state=0
)
model.fit(x_train, lp_train)
# Goodness of fit on the training set, then on the test set.
print(model.score(x_train, lp_train))
print(model.score(x_test, lp_test))

# Font used for every Chinese axis label in this block.
label_font = {'family': 'SimSun', 'size': 12}

# Fit plot: test-set predictions against actual values, plus the y = x line.
pred = model.predict(x_test)
plt.scatter(pred, lp_test, alpha=0.6, s=15, c='k')
plt.xlabel('测试集预测值', fontdict=label_font)
plt.ylabel('测试集实际值', fontdict=label_font)
for axis in (plt.gca().xaxis, plt.gca().yaxis):
    # A fresh formatter per axis: formatter objects are bound to one axis.
    axis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
w = numpy.linspace(pred.min(), pred.max(), 100)
plt.plot(w, w, 'k-')
plt.savefig('fig1.jpg', dpi=300)
plt.close()

# Partial dependence plots: one figure per variable, drawn in black.
# Each entry is (column name in x, x-axis label, output file).
for feature, x_label, out_file in (
    ("ENG", '恩格尔系数', 'fig2.jpg'),
    ("CS", '高端消费占比', 'fig3.jpg'),
    ("TC", '消费规模', 'fig4.jpg'),
    ("INC", '收入水平', 'fig5.jpg'),
    ("INNO", '创新水平', 'fig6.jpg'),
):
    display = PartialDependenceDisplay.from_estimator(model, x, features=[feature])
    for line in display.axes_[0, 0].lines:
        line.set_color('black')
    plt.xlabel(x_label, fontdict=label_font)
    plt.ylabel('部分依赖关系', fontdict=label_font)
    for axis in (plt.gca().xaxis, plt.gca().yaxis):
        axis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
    plt.savefig(out_file, dpi=300)
    plt.close()

# Variable-importance chart for the non-dummy variables (the original note
# expects 13 of them: inno, 4 explanatory variables and 8 controls). The
# count is taken from x1 instead of being hard-coded so bars and labels
# always agree.
# NOTE(review): assumes the first x1.shape[1] columns of x are exactly x1's
# columns, i.e. the dummies are appended after them -- confirm against the
# workbook layout.
n_vars = x1.shape[1]
importances = model.feature_importances_[:n_vars]
order = importances.argsort()
plt.barh(range(n_vars), importances[order], color='black')
plt.yticks(numpy.arange(n_vars), x1.columns[order], fontdict={'family': 'Times New Roman', 'size': 12})
plt.xlabel('变量重要性', fontdict=label_font)
plt.gca().xaxis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
plt.savefig('fig7.jpg', dpi=300)
plt.close()


# XGBoost --- target: lp (INNO is among the predictors)
# NOTE: this block writes the same file names (fig1.jpg-fig7.jpg) as the
# random-forest block of this section, so when both run in one pass the
# random-forest figures are overwritten. The original instruction was to
# enable only one of the two blocks at a time.
model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=500,
    max_depth=6,
    subsample=1,
    colsample_bytree=0.8,
    learning_rate=0.1,
    random_state=0,
)
model.fit(x_train, lp_train)
# Goodness of fit on the training set, then on the test set.
print(model.score(x_train, lp_train))
print(model.score(x_test, lp_test))

# Font used for every Chinese axis label in this block.
label_font = {'family': 'SimSun', 'size': 12}

# Fit plot: test-set predictions against actual values, plus the y = x line.
pred = model.predict(x_test)
plt.scatter(pred, lp_test, alpha=0.6, s=15, c='k')
plt.xlabel('测试集预测值', fontdict=label_font)
plt.ylabel('测试集实际值', fontdict=label_font)
for axis in (plt.gca().xaxis, plt.gca().yaxis):
    # A fresh formatter per axis: formatter objects are bound to one axis.
    axis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
w = numpy.linspace(pred.min(), pred.max(), 100)
plt.plot(w, w, 'k-')
plt.savefig('fig1.jpg', dpi=300)
plt.close()

# Partial dependence plots: one figure per variable, drawn in black.
# Each entry is (column name in x, x-axis label, output file).
for feature, x_label, out_file in (
    ("ENG", '恩格尔系数', 'fig2.jpg'),
    ("CS", '高端消费占比', 'fig3.jpg'),
    ("TC", '消费规模', 'fig4.jpg'),
    ("INC", '收入水平', 'fig5.jpg'),
    ("INNO", '创新水平', 'fig6.jpg'),
):
    display = PartialDependenceDisplay.from_estimator(model, x, features=[feature])
    for line in display.axes_[0, 0].lines:
        line.set_color('black')
    plt.xlabel(x_label, fontdict=label_font)
    plt.ylabel('部分依赖关系', fontdict=label_font)
    for axis in (plt.gca().xaxis, plt.gca().yaxis):
        axis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
    plt.savefig(out_file, dpi=300)
    plt.close()

# Variable-importance chart for the non-dummy variables. The count is taken
# from x1 instead of a hard-coded 13 so bars and labels always agree.
# NOTE(review): assumes the first x1.shape[1] columns of x are exactly x1's
# columns, i.e. the dummies are appended after them -- confirm against the
# workbook layout.
n_vars = x1.shape[1]
importances = model.feature_importances_[:n_vars]
order = importances.argsort()
plt.barh(range(n_vars), importances[order], color='black')
plt.yticks(numpy.arange(n_vars), x1.columns[order], fontdict={'family': 'Times New Roman', 'size': 12})
plt.xlabel('变量重要性', fontdict=label_font)
plt.gca().xaxis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
plt.savefig('fig7.jpg', dpi=300)
plt.close()

"""
3. 附录中的表3
稳健性检验1-创新水平（使用industry1数据）
"""

#导入相关程序包
import pandas
import random
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Load the data. Columns 0-4 are taken positionally as lp, tl, ts, inno and
# inno1; everything from column 5 onward forms the predictor matrix.
data = pandas.read_excel("industry1.xlsx")
lp, tl, ts, inno, inno1 = (data.iloc[:, pos] for pos in range(5))
# x1 keeps the columns from position 7 onward.
x1 = data.iloc[:, 7:]
# Turn 'id' and then 'year' into dummy variables.
x = pandas.get_dummies(
    pandas.get_dummies(data.iloc[:, 5:], columns=['id'], prefix=[""]),
    columns=['year'],
    prefix=[""],
)

# Refit both learners on a 60/40 and then a 50/50 train/test split and print,
# for each model, the training score followed by the test score.
for test_share in (0.4, 0.5):
    random.seed(123)
    x_train, x_test, inno_train, inno_test = train_test_split(
        x, inno, test_size=test_share, random_state=1
    )
    # One third of the predictor columns are candidates at each forest split.
    max_features = int(x_train.shape[1] / 3)
    for model in (
        RandomForestRegressor(
            n_estimators=500, max_features=max_features, random_state=0
        ),
        xgb.XGBRegressor(
            objective="reg:squarederror",
            n_estimators=500,
            max_depth=6,
            subsample=1,
            colsample_bytree=0.8,
            learning_rate=0.1,
            random_state=0,
        ),
    ):
        model.fit(x_train, inno_train)
        print(model.score(x_train, inno_train))
        print(model.score(x_test, inno_test))

"""
4. 附录中的表4
稳健性检验1-产业升级（使用industry2数据）
"""

#导入相关程序包
import pandas
import random
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Load the data. Columns 0-2 are taken positionally as lp, tl and ts;
# everything from column 3 onward forms the predictor matrix.
data = pandas.read_excel("industry2.xlsx")
lp, tl, ts = (data.iloc[:, pos] for pos in range(3))
# x1 keeps the columns from position 5 onward.
x1 = data.iloc[:, 5:]
# Turn 'id' and then 'year' into dummy variables.
x = pandas.get_dummies(
    pandas.get_dummies(data.iloc[:, 3:], columns=['id'], prefix=[""]),
    columns=['year'],
    prefix=[""],
)

# Refit both learners on a 60/40 and then a 50/50 train/test split and print,
# for each model, the training score followed by the test score.
for test_share in (0.4, 0.5):
    random.seed(123)
    x_train, x_test, lp_train, lp_test = train_test_split(
        x, lp, test_size=test_share, random_state=1
    )
    # One third of the predictor columns are candidates at each forest split.
    max_features = int(x_train.shape[1] / 3)
    for model in (
        RandomForestRegressor(
            n_estimators=500, max_features=max_features, random_state=0
        ),
        xgb.XGBRegressor(
            objective="reg:squarederror",
            n_estimators=500,
            max_depth=6,
            subsample=1,
            colsample_bytree=0.8,
            learning_rate=0.1,
            random_state=0,
        ),
    ):
        model.fit(x_train, lp_train)
        print(model.score(x_train, lp_train))
        print(model.score(x_test, lp_test))

"""
5. 正文中的图7，附录中表5
稳健性检验2-创新水平（使用industry1数据）
"""

#导入相关程序包
import matplotlib.pyplot as plt
import numpy
import pandas
import random
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.inspection import PartialDependenceDisplay

# Load the data. Columns 0-4 are taken positionally as lp, tl, ts, inno and
# inno1; everything from column 5 onward forms the predictor matrix.
data = pandas.read_excel("industry1.xlsx")
lp, tl, ts, inno, inno1 = (data.iloc[:, pos] for pos in range(5))
# x1 keeps the columns from position 7 onward; it is used later to size and
# label the variable-importance chart.
x1 = data.iloc[:, 7:]
# Turn province ('id') and then 'year' into dummy variables.
x = pandas.get_dummies(
    pandas.get_dummies(data.iloc[:, 5:], columns=['id'], prefix=[""]),
    columns=['year'],
    prefix=[""],
)
# Seeds Python's built-in RNG; the split below is made reproducible by its
# own random_state argument.
random.seed(123)
# Hold out 30% of the rows as the test set (70% train / 30% test); the target
# here is the alternative measure inno1.
x_train, x_test, inno1_train, inno1_test = train_test_split(
    x, inno1, test_size=0.3, random_state=1
)


### Random forest --- target: inno1
# Each split considers one third of the predictor columns as candidates.
max_features = int(x_train.shape[1] / 3)
model = RandomForestRegressor(
    n_estimators=500, max_features=max_features, random_state=0
)
model.fit(x_train, inno1_train)
# Goodness of fit on the training set, then on the test set.
print(model.score(x_train, inno1_train))
print(model.score(x_test, inno1_test))

# Font used for every Chinese axis label in this block.
label_font = {'family': 'SimSun', 'size': 12}

# Fit plot: test-set predictions against actual values, plus the y = x line.
pred = model.predict(x_test)
plt.scatter(pred, inno1_test, alpha=0.6, s=15, c='k')
plt.xlabel('测试集预测值', fontdict=label_font)
plt.ylabel('测试集实际值', fontdict=label_font)
for axis in (plt.gca().xaxis, plt.gca().yaxis):
    # A fresh formatter per axis: formatter objects are bound to one axis.
    axis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
# Widen the left margin of this figure.
plt.gcf().subplots_adjust(left=0.15)
w = numpy.linspace(pred.min(), pred.max(), 100)
plt.plot(w, w, 'k-')
plt.savefig('fig1.jpg', dpi=300)
plt.close()

# Partial dependence plots: one figure per variable, drawn in black.
# Each entry is (column name in x, x-axis label, output file).
for feature, x_label, out_file in (
    ("ENG", '恩格尔系数', 'fig2.jpg'),
    ("CS", '高端消费占比', 'fig3.jpg'),
    ("TC", '消费规模', 'fig4.jpg'),
    ("INC", '收入水平', 'fig5.jpg'),
):
    display = PartialDependenceDisplay.from_estimator(model, x, features=[feature])
    for line in display.axes_[0, 0].lines:
        line.set_color('black')
    plt.xlabel(x_label, fontdict=label_font)
    plt.ylabel('部分依赖关系', fontdict=label_font)
    for axis in (plt.gca().xaxis, plt.gca().yaxis):
        axis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
    plt.savefig(out_file, dpi=300)
    plt.close()

# Variable-importance chart for the non-dummy variables (the original note
# expects 12 of them: 4 explanatory variables and 8 controls). The count is
# taken from x1 instead of being hard-coded so bars and labels always agree.
# NOTE(review): assumes the first x1.shape[1] columns of x are exactly x1's
# columns, i.e. the dummies are appended after them -- confirm against the
# workbook layout.
n_vars = x1.shape[1]
importances = model.feature_importances_[:n_vars]
order = importances.argsort()
plt.barh(range(n_vars), importances[order], color='black')
plt.yticks(numpy.arange(n_vars), x1.columns[order], fontdict={'family': 'Times New Roman', 'size': 12})
plt.xlabel('变量重要性', fontdict=label_font)
plt.gca().xaxis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
plt.savefig('fig6.jpg', dpi=300)
plt.close()


# XGBoost --- target: inno1
# NOTE: this block writes the same file names (fig1.jpg-fig6.jpg) as the
# random-forest block of this section, so when both run in one pass the
# random-forest figures are overwritten. The original instruction was to
# enable only one of the two blocks at a time.
model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=500,
    max_depth=6,
    subsample=1,
    colsample_bytree=0.8,
    learning_rate=0.1,
    random_state=0,
)
model.fit(x_train, inno1_train)
# Goodness of fit on the training set, then on the test set.
print(model.score(x_train, inno1_train))
print(model.score(x_test, inno1_test))

# Font used for every Chinese axis label in this block.
label_font = {'family': 'SimSun', 'size': 12}

# Fit plot: test-set predictions against actual values, plus the y = x line.
pred = model.predict(x_test)
plt.scatter(pred, inno1_test, alpha=0.6, s=15, c='k')
plt.xlabel('测试集预测值', fontdict=label_font)
plt.ylabel('测试集实际值', fontdict=label_font)
for axis in (plt.gca().xaxis, plt.gca().yaxis):
    # A fresh formatter per axis: formatter objects are bound to one axis.
    axis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
# Widen the left margin of this figure.
plt.gcf().subplots_adjust(left=0.15)
w = numpy.linspace(pred.min(), pred.max(), 100)
plt.plot(w, w, 'k-')
plt.savefig('fig1.jpg', dpi=300)
plt.close()

# Partial dependence plots: one figure per variable, drawn in black.
# Each entry is (column name in x, x-axis label, output file).
for feature, x_label, out_file in (
    ("ENG", '恩格尔系数', 'fig2.jpg'),
    ("CS", '高端消费占比', 'fig3.jpg'),
    ("TC", '消费规模', 'fig4.jpg'),
    ("INC", '收入水平', 'fig5.jpg'),
):
    display = PartialDependenceDisplay.from_estimator(model, x, features=[feature])
    for line in display.axes_[0, 0].lines:
        line.set_color('black')
    plt.xlabel(x_label, fontdict=label_font)
    plt.ylabel('部分依赖关系', fontdict=label_font)
    for axis in (plt.gca().xaxis, plt.gca().yaxis):
        axis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
    plt.savefig(out_file, dpi=300)
    plt.close()

# Variable-importance chart for the non-dummy variables. The count is taken
# from x1 instead of a hard-coded 12 so bars and labels always agree.
# NOTE(review): assumes the first x1.shape[1] columns of x are exactly x1's
# columns, i.e. the dummies are appended after them -- confirm against the
# workbook layout.
n_vars = x1.shape[1]
importances = model.feature_importances_[:n_vars]
order = importances.argsort()
plt.barh(range(n_vars), importances[order], color='black')
plt.yticks(numpy.arange(n_vars), x1.columns[order], fontdict={'family': 'Times New Roman', 'size': 12})
# This x-axis label was missing here although every other importance chart in
# the file sets it; added for consistency.
plt.xlabel('变量重要性', fontdict=label_font)
plt.gca().xaxis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
plt.savefig('fig6.jpg', dpi=300)
plt.close()

"""
6. 正文中的图8，附录中表5
稳健性检验2-产业升级（使用industry3数据）
"""

#导入相关程序包
import matplotlib.pyplot as plt
import numpy
import pandas
import random
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.inspection import PartialDependenceDisplay

# Load the data. Columns 0-2 are taken positionally as lp, tl and ts;
# everything from column 3 onward forms the predictor matrix.
data = pandas.read_excel("industry3.xlsx")
lp, tl, ts = (data.iloc[:, pos] for pos in range(3))
# x1 keeps the columns from position 5 onward; it is used later to size and
# label the variable-importance chart.
x1 = data.iloc[:, 5:]
# Turn province ('id') and then 'year' into dummy variables.
x = pandas.get_dummies(
    pandas.get_dummies(data.iloc[:, 3:], columns=['id'], prefix=[""]),
    columns=['year'],
    prefix=[""],
)
# Seeds Python's built-in RNG; the split below is made reproducible by its
# own random_state argument.
random.seed(123)
# Hold out 30% of the rows as the test set (70% train / 30% test); the target
# here is ts.
x_train, x_test, ts_train, ts_test = train_test_split(
    x, ts, test_size=0.3, random_state=1
)


### Random forest --- target: ts (INNO1 is among the predictors)
# Each split considers one third of the predictor columns as candidates.
max_features = int(x_train.shape[1] / 3)
model = RandomForestRegressor(
    n_estimators=500, max_features=max_features, random_state=0
)
model.fit(x_train, ts_train)
# Goodness of fit on the training set, then on the test set.
print(model.score(x_train, ts_train))
print(model.score(x_test, ts_test))

# Font used for every Chinese axis label in this block.
label_font = {'family': 'SimSun', 'size': 12}

# Fit plot: test-set predictions against actual values, plus the y = x line.
pred = model.predict(x_test)
plt.scatter(pred, ts_test, alpha=0.6, s=15, c='k')
plt.xlabel('测试集预测值', fontdict=label_font)
plt.ylabel('测试集实际值', fontdict=label_font)
for axis in (plt.gca().xaxis, plt.gca().yaxis):
    # A fresh formatter per axis: formatter objects are bound to one axis.
    axis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
w = numpy.linspace(pred.min(), pred.max(), 100)
plt.plot(w, w, 'k-')
plt.savefig('fig1.jpg', dpi=300)
plt.close()

# Partial dependence plots: one figure per variable, drawn in black.
# Each entry is (column name in x, x-axis label, output file).
for feature, x_label, out_file in (
    ("ENG", '恩格尔系数', 'fig2.jpg'),
    ("CS", '高端消费占比', 'fig3.jpg'),
    ("TC", '消费规模', 'fig4.jpg'),
    ("INC", '收入水平', 'fig5.jpg'),
    ("INNO1", '创新水平', 'fig6.jpg'),
):
    display = PartialDependenceDisplay.from_estimator(model, x, features=[feature])
    for line in display.axes_[0, 0].lines:
        line.set_color('black')
    plt.xlabel(x_label, fontdict=label_font)
    plt.ylabel('部分依赖关系', fontdict=label_font)
    for axis in (plt.gca().xaxis, plt.gca().yaxis):
        axis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
    plt.savefig(out_file, dpi=300)
    plt.close()

# Variable-importance chart for the non-dummy variables (the original note
# expects 13 of them: inno1, 4 explanatory variables and 8 controls). The
# count is taken from x1 instead of being hard-coded so bars and labels
# always agree.
# NOTE(review): assumes the first x1.shape[1] columns of x are exactly x1's
# columns, i.e. the dummies are appended after them -- confirm against the
# workbook layout.
n_vars = x1.shape[1]
importances = model.feature_importances_[:n_vars]
order = importances.argsort()
plt.barh(range(n_vars), importances[order], color='black')
plt.yticks(numpy.arange(n_vars), x1.columns[order], fontdict={'family': 'Times New Roman', 'size': 12})
plt.xlabel('变量重要性', fontdict=label_font)
plt.gca().xaxis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
plt.savefig('fig7.jpg', dpi=300)
plt.close()


# XGBoost --- target: ts (INNO1 is among the predictors)
# NOTE: this block writes the same file names (fig1.jpg-fig7.jpg) as the
# random-forest block of this section, so when both run in one pass the
# random-forest figures are overwritten. The original instruction was to
# enable only one of the two blocks at a time.
model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=500,
    max_depth=6,
    subsample=1,
    colsample_bytree=0.8,
    learning_rate=0.1,
    random_state=0,
)
model.fit(x_train, ts_train)
# Goodness of fit on the training set, then on the test set.
print(model.score(x_train, ts_train))
print(model.score(x_test, ts_test))

# Font used for every Chinese axis label in this block.
label_font = {'family': 'SimSun', 'size': 12}

# Fit plot: test-set predictions against actual values, plus the y = x line.
pred = model.predict(x_test)
plt.scatter(pred, ts_test, alpha=0.6, s=15, c='k')
plt.xlabel('测试集预测值', fontdict=label_font)
plt.ylabel('测试集实际值', fontdict=label_font)
for axis in (plt.gca().xaxis, plt.gca().yaxis):
    # A fresh formatter per axis: formatter objects are bound to one axis.
    axis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
w = numpy.linspace(pred.min(), pred.max(), 100)
plt.plot(w, w, 'k-')
plt.savefig('fig1.jpg', dpi=300)
plt.close()

# Partial dependence plots: one figure per variable, drawn in black.
# Each entry is (column name in x, x-axis label, output file).
for feature, x_label, out_file in (
    ("ENG", '恩格尔系数', 'fig2.jpg'),
    ("CS", '高端消费占比', 'fig3.jpg'),
    ("TC", '消费规模', 'fig4.jpg'),
    ("INC", '收入水平', 'fig5.jpg'),
    ("INNO1", '创新水平', 'fig6.jpg'),
):
    display = PartialDependenceDisplay.from_estimator(model, x, features=[feature])
    for line in display.axes_[0, 0].lines:
        line.set_color('black')
    plt.xlabel(x_label, fontdict=label_font)
    plt.ylabel('部分依赖关系', fontdict=label_font)
    for axis in (plt.gca().xaxis, plt.gca().yaxis):
        axis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
    plt.savefig(out_file, dpi=300)
    plt.close()

# Variable-importance chart for the non-dummy variables. The count is taken
# from x1 instead of a hard-coded 13 so bars and labels always agree.
# NOTE(review): assumes the first x1.shape[1] columns of x are exactly x1's
# columns, i.e. the dummies are appended after them -- confirm against the
# workbook layout.
n_vars = x1.shape[1]
importances = model.feature_importances_[:n_vars]
order = importances.argsort()
plt.barh(range(n_vars), importances[order], color='black')
plt.yticks(numpy.arange(n_vars), x1.columns[order], fontdict={'family': 'Times New Roman', 'size': 12})
plt.xlabel('变量重要性', fontdict=label_font)
plt.gca().xaxis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
plt.savefig('fig7.jpg', dpi=300)
plt.close()

"""
7. 附录中的表6、图5
稳健性检验3-创新水平（使用industry1数据）
"""

#导入相关程序包
import numpy as np
import matplotlib.pyplot as plt
import pandas
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.inspection import PartialDependenceDisplay

# Load the data set. Per the original note, columns 1-5 are named
# lp, tl, ts, inno and inno1.
data = pandas.read_excel("industry1.xlsx")
# Pull the first five columns out by position.
lp, tl, ts, inno, inno1 = (data.iloc[:, pos] for pos in range(5))
# x1: everything from the 8th column on; x: everything from the 6th column on
# (x still contains the 'id' and 'year' columns that are dummy-encoded below).
x1 = data.iloc[:, 7:]
x = data.iloc[:, 5:]
# Turn province ('id') and year into dummy variables, one column at a time.
for dummy_col in ('id', 'year'):
    x = pandas.get_dummies(x, columns=[dummy_col], prefix=[""])
# Split into training and test sets (test share 0.3) with a fixed seed.
x_train, x_test, inno_train, inno_test = train_test_split(
    x, inno, test_size=0.3, random_state=1
)

### Support vector regression --- inno
# Standardize the features: the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler()
scaler.fit(x_train)
x_train_s = scaler.transform(x_train)
x_test_s = scaler.transform(x_test)
# Baseline model: RBF-kernel SVR with default hyper-parameters.
model = SVR(kernel='rbf')
model.fit(x_train_s, inno_train)
# Goodness of fit on the training and test splits.
print(model.score(x_train_s, inno_train))
print(model.score(x_test_s, inno_test))
# Show all parameters of the baseline model. A bare `model.get_params()`
# expression discards its result when the file is run as a script, so the
# value is printed explicitly.
print(model.get_params())
# Search for the best model over a grid of C / epsilon / gamma values using
# shuffled 10-fold cross-validation with a fixed seed.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 50, 100, 150],
    'epsilon': [0.01, 0.1, 1, 10],
    'gamma': [0.01, 0.1, 1, 10],
}
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
# NOTE(review): x_train_s was scaled with statistics from the whole training
# split, so every validation fold has already influenced the scaling. Putting
# StandardScaler and SVR in a Pipeline inside GridSearchCV would avoid this,
# but would change the reported numbers, so it is left as is.
grid_search = GridSearchCV(SVR(), param_grid, cv=kfold)
grid_search.fit(x_train_s, inno_train)
# From here on `model` is the best estimator found by the search.
model = grid_search.best_estimator_
# Goodness of fit of the tuned model.
print(model.score(x_train_s, inno_train))
print(model.score(x_test_s, inno_test))
# Partial-dependence plots of the tuned SVR for the first four feature columns.
# The four figures differ only in column index, x-axis label and output file,
# so they are produced from one table instead of four copy-pasted stanzas.
# NOTE: the plots are computed on x_train_s, so the x axis is in standardized
# units, not in the raw units of each variable.
# Names are supplied only for the leading columns that are plotted.
pdp_feature_names = ["ENG","CS","TC","INC"]
pdp_specs = [
    (0, '恩格尔系数', 'fig1.jpg'),
    (1, '高端消费占比', 'fig2.jpg'),
    (2, '消费规模', 'fig3.jpg'),
    (3, '收入水平', 'fig4.jpg'),
]
pdp_displays = []
for pdp_index, pdp_xlabel, pdp_path in pdp_specs:
    pdp_display = PartialDependenceDisplay.from_estimator(
        model, x_train_s, [pdp_index], feature_names=pdp_feature_names
    )
    # A single feature is requested, so the display holds one axes at [0, 0].
    # Working on that axes (and on display.figure_) avoids relying on
    # pyplot's notion of the "current" axes/figure.
    pdp_ax = pdp_display.axes_[0, 0]
    # Draw every curve in black.
    for line in pdp_ax.lines:
        line.set_color('black')
    pdp_ax.set_xlabel(pdp_xlabel, fontdict={'family':'SimSun','size':12})
    pdp_ax.set_ylabel('部分依赖关系', fontdict={'family':'SimSun','size':12})
    # Two-decimal tick labels on both axes.
    pdp_ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
    pdp_ax.yaxis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
    pdp_display.figure_.tight_layout()
    pdp_display.figure_.savefig(pdp_path, dpi=300)
    # Close the figure so the next plot starts from a clean state.
    plt.close(pdp_display.figure_)
    pdp_displays.append(pdp_display)
# Keep the historical per-figure names bound to their display objects.
fig1, fig2, fig3, fig4 = pdp_displays

# Fit plot: test-set predictions against actual values, with the y = x line
# drawn across the range of the predictions.
pred = model.predict(x_test_s)
label_font = {'family':'SimSun','size':12}
plt.scatter(pred, inno_test, alpha=0.6, s=15, c='k')
plt.xlabel('测试集预测值', fontdict=label_font)
plt.ylabel('测试集实际值', fontdict=label_font)
# Two-decimal tick labels on both axes.
fit_ax = plt.gca()
for fit_axis in (fit_ax.xaxis, fit_ax.yaxis):
    fit_axis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
# 100 evenly spaced points between the smallest and largest prediction.
pred_lo, pred_hi = min(pred), max(pred)
w = np.linspace(pred_lo, pred_hi, 100)
plt.plot(w, w, 'k-')
plt.savefig('fig5.jpg', dpi=300)
plt.close()

"""
8. 附录中的表6、图6
稳健性检验3-产业升级（使用industry2数据）
"""

#导入相关程序包
import numpy as np
import matplotlib.pyplot as plt
import pandas
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.inspection import PartialDependenceDisplay
# Load the data set. Per the original note, columns 1-3 are named lp, tl and ts.
data = pandas.read_excel("industry2.xlsx")
# Pull the first three columns out by position.
lp, tl, ts = (data.iloc[:, pos] for pos in range(3))
# x1: everything from the 6th column on; x: everything from the 4th column on
# (x still contains the 'id' and 'year' columns that are dummy-encoded below).
x1 = data.iloc[:, 5:]
x = data.iloc[:, 3:]
# Turn province ('id') and year into dummy variables, one column at a time.
for dummy_col in ('id', 'year'):
    x = pandas.get_dummies(x, columns=[dummy_col], prefix=[""])
# Split into training and test sets (test share 0.3) with a fixed seed.
x_train, x_test, lp_train, lp_test = train_test_split(
    x, lp, test_size=0.3, random_state=1
)

### Support vector regression --- lp
# (The original header said "inno", but the target fitted here is lp.)
# Standardize the features: the scaler is fitted on the training split only
# and then applied to both splits.
scaler = StandardScaler()
scaler.fit(x_train)
x_train_s = scaler.transform(x_train)
x_test_s = scaler.transform(x_test)
# Baseline model: RBF-kernel SVR with default hyper-parameters.
model = SVR(kernel='rbf')
model.fit(x_train_s, lp_train)
# Goodness of fit on the training and test splits.
print(model.score(x_train_s, lp_train))
print(model.score(x_test_s, lp_test))
# Show all parameters of the baseline model. A bare `model.get_params()`
# expression discards its result when the file is run as a script, so the
# value is printed explicitly.
print(model.get_params())
# Search for the best model over a grid of C / epsilon / gamma values using
# shuffled 10-fold cross-validation with a fixed seed.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 50, 100, 150],
    'epsilon': [0.01, 0.1, 1, 10],
    'gamma': [0.01, 0.1, 1, 10],
}
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
# NOTE(review): x_train_s was scaled with statistics from the whole training
# split, so every validation fold has already influenced the scaling. Putting
# StandardScaler and SVR in a Pipeline inside GridSearchCV would avoid this,
# but would change the reported numbers, so it is left as is.
grid_search = GridSearchCV(SVR(), param_grid, cv=kfold)
grid_search.fit(x_train_s, lp_train)
# From here on `model` is the best estimator found by the search.
model = grid_search.best_estimator_
# Goodness of fit of the tuned model.
print(model.score(x_train_s, lp_train))
print(model.score(x_test_s, lp_test))
# Partial-dependence plots of the tuned SVR for the first five feature columns.
# The five figures differ only in column index, x-axis label and output file,
# so they are produced from one table instead of five copy-pasted stanzas.
# The original figure order is kept (columns 1-4 first, column 0 last).
# NOTE: the plots are computed on x_train_s, so the x axis is in standardized
# units, not in the raw units of each variable.
# Names are supplied only for the leading columns that are plotted.
pdp_feature_names = ["INNO","ENG","CS","TC","INC"]
pdp_specs = [
    (1, '恩格尔系数', 'fig1.jpg'),
    (2, '高端消费占比', 'fig2.jpg'),
    (3, '消费规模', 'fig3.jpg'),
    (4, '收入水平', 'fig4.jpg'),
    (0, '创新水平', 'fig5.jpg'),
]
pdp_displays = []
for pdp_index, pdp_xlabel, pdp_path in pdp_specs:
    pdp_display = PartialDependenceDisplay.from_estimator(
        model, x_train_s, [pdp_index], feature_names=pdp_feature_names
    )
    # A single feature is requested, so the display holds one axes at [0, 0].
    # Working on that axes (and on display.figure_) avoids relying on
    # pyplot's notion of the "current" axes/figure.
    pdp_ax = pdp_display.axes_[0, 0]
    # Draw every curve in black.
    for line in pdp_ax.lines:
        line.set_color('black')
    pdp_ax.set_xlabel(pdp_xlabel, fontdict={'family':'SimSun','size':12})
    pdp_ax.set_ylabel('部分依赖关系', fontdict={'family':'SimSun','size':12})
    # Two-decimal tick labels on both axes.
    pdp_ax.xaxis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
    pdp_ax.yaxis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
    pdp_display.figure_.tight_layout()
    pdp_display.figure_.savefig(pdp_path, dpi=300)
    # Close the figure so the next plot starts from a clean state.
    plt.close(pdp_display.figure_)
    pdp_displays.append(pdp_display)
# Keep the historical per-figure names bound to their display objects.
fig1, fig2, fig3, fig4, fig5 = pdp_displays

# Fit plot: test-set predictions against actual values, with the y = x line
# drawn across the range of the predictions.
pred = model.predict(x_test_s)
label_font = {'family':'SimSun','size':12}
plt.scatter(pred, lp_test, alpha=0.6, s=15, c='k')
plt.xlabel('测试集预测值', fontdict=label_font)
plt.ylabel('测试集实际值', fontdict=label_font)
# Two-decimal tick labels on both axes.
fit_ax = plt.gca()
for fit_axis in (fit_ax.xaxis, fit_ax.yaxis):
    fit_axis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
# 100 evenly spaced points between the smallest and largest prediction.
pred_lo, pred_hi = min(pred), max(pred)
w = np.linspace(pred_lo, pred_hi, 100)
plt.plot(w, w, 'k-')
plt.savefig('fig6.jpg', dpi=300)
plt.close()