In [1]:
import numpy as np import pandas as pd import matplotlib.pyplot as plt plt.rcParams['font.size'] = 20 #固定字体大小 plt.rcParams['figure.figsize'] = (14, 6) #固定图像大小 plt.rcParams['font.sans-serif']=['SimSun'] #用来正常显示中文标签 plt.rcParams['axes.unicode_minus']=False #用来正常显示负号
In [2]:
train = pd.read_csv('train_grade.csv') evl = pd.read_csv('evl_grade.csv')
In [3]:
tmp = train.pop('name') tmp = train.pop('5_price') tmp = train.pop('size') tmp = train.pop('city_name') y_train = train.pop('6_price')
In [4]:
tmp = evl.pop('name') tmp = evl.pop('6_price') tmp = evl.pop('size') tmp = evl.pop('city_name') y_evl = evl.pop('5_price')
In [5]:
train['const'] = 1 # 加上一列常数 evl['const'] = 1 train
Out[5]:
metro | bus | school | hospital | shop | city | year | const | |
---|---|---|---|---|---|---|---|---|
0 | 4.0 | 3.0 | 3.0 | 4.0 | 3.0 | 4 | 10.0 | 1 |
1 | 3.0 | 2.0 | 5.0 | 4.0 | 4.0 | 4 | 18.0 | 1 |
2 | 4.0 | 3.0 | 4.0 | 2.0 | 4.0 | 4 | 5.0 | 1 |
3 | 3.0 | 4.0 | 3.0 | 3.0 | 4.0 | 4 | 7.0 | 1 |
4 | 4.0 | 3.0 | 2.0 | 4.0 | 3.0 | 4 | 5.0 | 1 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
234 | 5.0 | 5.0 | 4.0 | 5.0 | 4.0 | 4 | 9.0 | 1 |
235 | 5.0 | 5.0 | 4.0 | 4.0 | 4.0 | 4 | 15.0 | 1 |
236 | 4.0 | 4.0 | 3.0 | 4.0 | 4.0 | 4 | 6.0 | 1 |
237 | 1.0 | 5.0 | 5.0 | 4.0 | 4.0 | 4 | 6.0 | 1 |
238 | 2.0 | 4.0 | 2.0 | 3.0 | 3.0 | 4 | 3.0 | 1 |
239 rows × 8 columns
In [6]:
x = train.to_numpy() y = y_train.to_numpy() x1 = evl.to_numpy() y1 = y_evl.to_numpy() n = x.shape[0] W = np.array([np.random.rand()-0.5 for x in range(8)]) # W 初始化为 0 向量 W = W.reshape(-1, 1) L = lambda: .5 * np.power(np.linalg.norm(y1 - x1 @ W), 2) # 对验证集求loss print('初始loss: {}'.format(L())) epoch = 10000 mn = 1e15 best = W for T in range(epoch): # 对5,6月数据进行训练 for i in np.random.permutation(range(n)): # 随机排列 tx = x[i].reshape(1, -1) ty = y[i].reshape(1, -1) d = tx.T @ (ty - tx @ W) # 计算下降方向 d /= np.linalg.norm(d) # 单位化 W += d for i in np.random.permutation(range(x1.shape[0])): tx = x1[i].reshape(1, -1) ty = y1[i].reshape(1, -1) d = tx.T @ (ty - tx @ W) # 计算下降方向 d /= np.linalg.norm(d) # 单位化 W += d if (L() < mn): mn = L() best = W.copy() n = x1.shape[0] print(L()) print(mn) print('best=\n{}\nW=\n{}'.format(best, W)) W = best
初始loss: 734856210123.7843 35739356096.42686 35207133070.37157 best= [[ 249.47644353] [ 328.66830064] [ 446.52438742] [ 629.29978699] [ 62.1182435 ] [ 456.43293626] [-263.05799612] [9889.82129488]] W= [[ 225.15683901] [ 290.40582281] [ 421.29290012] [ 606.13554503] [ 37.29548009] [ 464.22675135] [ -275.49544078] [10319.38296581]]
In [7]:
prop = np.sign(W) * np.log(np.abs(W)) prop = prop.reshape(-1,) prop_dict = dict(zip(train.columns, prop)) for key in prop_dict: print('{}: {:.2f}'.format(key, prop_dict[key]))
metro: 5.52 bus: 5.80 school: 6.10 hospital: 6.44 shop: 4.13 city: 6.12 year: -5.57 const: 9.20
下面计算正项系数的百分比,不包括负系数和常数项.
In [8]:
tmp = prop_dict.pop('year') tmp = prop_dict.pop('const')
In [9]:
tot = sum(prop_dict.values()) for key in prop_dict: if prop_dict[key] < 0 or key == 'const': continue prop_dict[key] /= tot prop_dict[key] += 1e-5 print('{}: {:.2%}'.format(key, prop_dict[key]))
metro: 16.18% bus: 16.99% school: 17.89% hospital: 18.89% shop: 12.11% city: 17.95%
In [10]:
print(sum(prop_dict.values()))
1.0000600000000002
In [11]:
plt.figure(figsize=(8, 8)) labels = ['地铁', '公交', '学校', '医院', '超市', '城区'] plt.pie(prop_dict.values(), explode=[0.01 for x in range(6)], labels=labels, autopct='%.2f%%', colors=['#FCA5A5', '#FCD34D', '#BEF264', '#7DD3FC', '#D8B4FE', '#F9A8D4']) plt.savefig('系数占比_饼图_改进.pdf') plt.show()
In [12]:
predict = evl.to_numpy() @ W ans = y_evl.to_numpy().reshape(-1, 1) pred_df = pd.DataFrame((np.round((predict - ans) / ans * 1e4)) / 1e2)
In [13]:
pred_df.describe()
Out[13]:
0 | |
---|---|
count | 79.000000 |
mean | 2.409873 |
std | 15.918829 |
min | -31.170000 |
25% | -6.610000 |
50% | 2.760000 |
75% | 9.635000 |
max | 39.540000 |
In [14]:
# 统计在-10%~10%之间的 sum(((pred_df >= -10) & (pred_df <= 10)).to_numpy())
Out[14]:
array([42])
In [15]:
ax1 = pred_df.loc[:, 0].plot(kind='hist', bins=30, grid=True) labels = [] for x in range(-41, 101): if x % 20 == 0: labels.append(r'{}%'.format(x)) plt.xticks([x for x in range(-41, 101) if x % 20 == 0], labels=labels) plt.axis([-40, 50, 0, 9]) ax1.set_xlabel('准确率') ax1.set_ylabel('个数') ax2 = ax1.twinx() pred_df.loc[:, 0].plot(kind = 'kde', color='orange', ax=ax2, label='概率密度分布') ax2.set_yticks([]) ax2.set_ylabel('') plt.legend() plt.savefig('准确率分布_改进.pdf') plt.show()