import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Global matplotlib defaults applied to every figure drawn below.
plt.rcParams.update({
    'font.size': 20,                # fixed font size
    'figure.figsize': (14, 6),      # fixed figure size
    'font.sans-serif': ['SimSun'],  # font used so Chinese labels render
    'axes.unicode_minus': False,    # render the minus sign correctly
})


# Read the two graded tables from CSV files in the working directory.
train, evl = (
    pd.read_csv(csv_path)
    for csv_path in ('train_grade.csv', 'evl_grade.csv')
)


# Remove the columns that are not kept as feature columns of `train`.
# `tmp` merely receives each popped column (it ends up holding 'city_name').
for unused_column in ('name', '5_price', 'size', 'city_name'):
    tmp = train.pop(unused_column)
# Split off the '6_price' column as a separate series.
y_train = train.pop('6_price')


# Remove the columns that are not kept as feature columns of `evl`.
# `tmp` merely receives each popped column (it ends up holding 'city_name').
for unused_column in ('name', '6_price', 'size', 'city_name'):
    tmp = evl.pop(unused_column)
# Split off the '5_price' column as a separate series.
y_evl = evl.pop('5_price')


# Append a constant column of ones to both tables.
for frame in (train, evl):
    frame['const'] = 1
# Bare expression: shows the table when run as an interactive cell and has no
# effect in a plain script.
train


x = train.to_numpy()
y = y_train.to_numpy()
x1 = evl.to_numpy()
y1 = y_evl.to_numpy()
n = x.shape[0]

# One weight per feature column, drawn uniformly from [-0.5, 0.5) and stored
# as a column vector (random initialisation, not zeros).
W = (np.random.rand(x.shape[1]) - 0.5).reshape(-1, 1)


def L():
    """Return half the squared L2 norm of the residual of W on (x1, y1).

    y1 is reshaped to a column so it lines up with ``x1 @ W``.  Subtracting
    the 1-D y1 directly would broadcast to an (m, m) matrix of all pairwise
    differences instead of the m per-sample residuals.
    """
    return .5 * np.power(np.linalg.norm(y1.reshape(-1, 1) - x1 @ W), 2)


def _sgd_pass(weights, features, targets):
    """Run one shuffled pass of unit-length gradient steps, updating weights in place.

    For every row (visited in random order) the step direction is
    ``row.T @ (target - row @ weights)`` scaled to unit length.
    """
    for i in np.random.permutation(features.shape[0]):  # random visiting order
        tx = features[i].reshape(1, -1)
        ty = targets[i].reshape(1, -1)
        d = tx.T @ (ty - tx @ weights)  # step direction for this sample
        norm = np.linalg.norm(d)
        if norm == 0:
            # Zero direction: normalising would divide by zero and put NaN
            # into the weights, so this sample contributes no step.
            continue
        weights += d / norm  # unit-length step, in place


print('初始loss: {}'.format(L()))
epoch = 10000
mn = np.inf
# Copy, so that the in-place updates of W cannot silently change `best`.
best = W.copy()
# NOTE(review): W is updated on (x1, y1) as well, which is the same data L()
# scores, so `mn` is not a held-out loss.
for T in range(epoch):
    _sgd_pass(W, x, y)
    _sgd_pass(W, x1, y1)
    current_loss = L()  # evaluate once per epoch
    if current_loss < mn:
        mn = current_loss
        best = W.copy()
n = x1.shape[0]
print(L())
print(mn)
print('best=\n{}\nW=\n{}'.format(best, W))
# Continue with the weights that achieved the lowest recorded loss.
W = best

初始loss: 734856210123.7843
35739356096.42686
35207133070.37157
best=
[[ 249.47644353]
 [ 328.66830064]
 [ 446.52438742]
 [ 629.29978699]
 [  62.1182435 ]
 [ 456.43293626]
 [-263.05799612]
 [9889.82129488]]
W=
[[  225.15683901]
 [  290.40582281]
 [  421.29290012]
 [  606.13554503]
 [   37.29548009]
 [  464.22675135]
 [ -275.49544078]
 [10319.38296581]]


# Signed log-magnitude of every weight, sign(w) * ln|w|, flattened to 1-D.
prop = (np.sign(W) * np.log(np.abs(W))).ravel()
# Pair each column name of `train` with its transformed weight.
prop_dict = {column: value for column, value in zip(train.columns, prop)}
for column, value in prop_dict.items():
    print('{}: {:.2f}'.format(column, value))

metro: 5.52
bus: 5.80
school: 6.10
hospital: 6.44
shop: 4.13
city: 6.12
year: -5.57
const: 9.20


# Remove the 'year' and 'const' entries from prop_dict; `tmp` receives each
# removed value in turn. Raises KeyError if an entry is already gone.
for excluded in ('year', 'const'):
    tmp = prop_dict.pop(excluded)


# Convert the non-negative entries of prop_dict into shares of their total.
# The denominator is taken over exactly the entries that get normalised, so an
# entry that is skipped (negative value, or the 'const' key) cannot distort
# the resulting shares.
share_keys = [
    name for name, value in prop_dict.items()
    if not (value < 0 or name == 'const')
]
tot = sum(prop_dict[name] for name in share_keys)
for key in share_keys:
    prop_dict[key] /= tot
    # Small constant offset added to every share; because of it the shares
    # add up to slightly more than 1.
    prop_dict[key] += 1e-5
    print('{}: {:.2%}'.format(key, prop_dict[key]))

metro: 16.18%
bus: 16.99%
school: 17.89%
hospital: 18.89%
shop: 12.11%
city: 17.95%


# Print the total of the values currently stored in prop_dict.
share_total = sum(prop_dict.values())
print(share_total)

1.0000600000000002


# Pie chart of the values in prop_dict, saved to PDF and then shown.
plt.figure(figsize=(8, 8))
labels = ['地铁', '公交', '学校', '医院', '超市', '城区']
wedge_colors = ['#FCA5A5', '#FCD34D', '#BEF264', '#7DD3FC', '#D8B4FE', '#F9A8D4']
plt.pie(
    prop_dict.values(),
    explode=[0.01] * 6,  # pull every wedge slightly away from the centre
    labels=labels,
    autopct='%.2f%%',    # print each wedge's percentage with two decimals
    colors=wedge_colors,
)
plt.savefig('系数占比_饼图_改进.pdf')
plt.show()


# Model output for every row of `evl`, as a column vector.
predict = evl.to_numpy() @ W
# Reference values reshaped to a column so they align element-wise with predict.
ans = y_evl.to_numpy().reshape(-1, 1)
# Relative deviation in percent, rounded to two decimals: scale by 1e4, round
# to an integer, then divide by 1e2.
scaled_deviation = (predict - ans) / ans * 1e4
pred_df = pd.DataFrame(np.round(scaled_deviation) / 1e2)


# Summary statistics of pred_df (displayed when run as an interactive cell).
pred_df.describe()


# Per-column count of the entries that lie between -10 and 10 (inclusive).
within_band = (pred_df >= -10) & (pred_df <= 10)
within_band.to_numpy().sum(axis=0)

array([42])


# Histogram of column 0 of pred_df with a KDE curve on a twin y-axis.
ax1 = pred_df.loc[:, 0].plot(kind='hist', bins=30, grid=True)

# Tick every 20 units from -40 to 100, labelled as percentages.  Built with
# comprehensions so that no module-level loop variable is left behind (a
# top-level `for x in ...` would rebind the global `x`).
tick_positions = list(range(-40, 101, 20))
labels = [r'{}%'.format(tick) for tick in tick_positions]
plt.xticks(tick_positions, labels=labels)
plt.axis([-40, 50, 0, 9])  # visible window: x in [-40, 50], y in [0, 9]
ax1.set_xlabel('准确率')
ax1.set_ylabel('个数')

# Second y-axis sharing the same x-axis, used only for the density curve.
ax2 = ax1.twinx()
pred_df.loc[:, 0].plot(kind='kde', color='orange', ax=ax2, label='概率密度分布')
ax2.set_yticks([])   # hide the density scale
ax2.set_ylabel('')
plt.legend()

plt.savefig('准确率分布_改进.pdf')
plt.show()

	metro	bus	school	hospital	shop	city	year	const
0	4.0	3.0	3.0	4.0	3.0	4	10.0	1
1	3.0	2.0	5.0	4.0	4.0	4	18.0	1
2	4.0	3.0	4.0	2.0	4.0	4	5.0	1
3	3.0	4.0	3.0	3.0	4.0	4	7.0	1
4	4.0	3.0	2.0	4.0	3.0	4	5.0	1
...	...	...	...	...	...	...	...	...
234	5.0	5.0	4.0	5.0	4.0	4	9.0	1
235	5.0	5.0	4.0	4.0	4.0	4	15.0	1
236	4.0	4.0	3.0	4.0	4.0	4	6.0	1
237	1.0	5.0	5.0	4.0	4.0	4	6.0	1
238	2.0	4.0	2.0	3.0	3.0	4	3.0	1

	0
count	79.000000
mean	2.409873
std	15.918829
min	-31.170000
25%	-6.610000
50%	2.760000
75%	9.635000
max	39.540000