import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import kstest
plt.rcParams['font.size'] = 20 #固定字体大小
plt.rcParams['figure.figsize'] = (14, 6) #固定图像大小
plt.rcParams['font.sans-serif']=['SimSun'] #用来正常显示中文标签
plt.rcParams['axes.unicode_minus']=False #用来正常显示负号


df = pd.read_excel('data.xlsx')
n, m = df.shape
type(df.iloc[1])

pandas.core.series.Series


df.dtypes

name         object
5_price     float64
6_price       int64
size        float64
metro       float64
bus         float64
school      float64
hospital    float64
shop        float64
city          int64
year        float64
dtype: object


# 处理计算房龄
for i in range(n):
    x = df.loc[i, 'year']
    if pd.isna(x):
        continue
    if x > 1000:
        x = 2022 - x
    else:
        if 80 <= x <= 99:
            x -= 100
        x = 22 - x
    # print(x)
    df.loc[i, 'year'] = x


print('没有房龄={}, 没有面积={}'.format(df['year'].isna().sum(), df['size'].isna().sum()))
print('既没有房龄也没有面积={}'.format((df['year'].isna() & df['size'].isna()).sum()))
print('既没有房龄或者没有面积={}'.format((df['year'].isna() | df['size'].isna()).sum()))

没有房龄=33, 没有面积=40
既没有房龄也没有面积=31
既没有房龄或者没有面积=42


df = df[-(df['year'].isna() | df['size'].isna())]


city_name = ['未央区', '雁塔区', '莲湖区', '碑林区', '新城区', '灞桥区', '长安区', '高新区', '经开区',
             '浐灞生态区', '国际港务区', '西咸新区']
evl = df[-df['5_price'].isna()].reset_index()
n, m = evl.shape
cnt = [0] * 12
for i in range(n):
    cnt[evl.loc[i, 'city']] += 1
print('地区: 用于检验的个数')
for i in range(12):
    print('{}: {}'.format(city_name[i], cnt[i]))
print('总计: {}'.format(sum(cnt)))

地区: 用于检验的个数
未央区: 6
雁塔区: 7
莲湖区: 6
碑林区: 7
新城区: 6
灞桥区: 7
长安区: 7
高新区: 7
经开区: 6
浐灞生态区: 6
国际港务区: 7
西咸新区: 7
总计: 79


train = df[df['5_price'].isna()].reset_index()


print('训练数据个数={}, 检验数据个数={}, 总计={}'.format(train.shape[0], evl.shape[0], df.shape[0]))
df.describe()

训练数据个数=239, 检验数据个数=79, 总计=318


train.to_csv('train.csv')
evl.to_csv('evaluation.csv')


# k-s测试
def ks_test(name):
    return kstest(train[name].to_numpy(), 'norm', (train[name].mean(), train[name].std()))
print(ks_test('metro'))
print(ks_test('bus'))
print(ks_test('school'))
print(ks_test('hospital'))
print(ks_test('shop'))

KstestResult(statistic=0.14509115603493106, pvalue=7.44853726739269e-05)
KstestResult(statistic=0.0886852172244787, pvalue=0.043775126850685186)
KstestResult(statistic=0.07806724132243936, pvalue=0.10295009934945765)
KstestResult(statistic=0.18220505774787032, pvalue=2.0450720133654144e-07)
KstestResult(statistic=0.1024330554958281, pvalue=0.012303964920067209)


plt.rcParams['font.size'] = 40 #固定字体大小
plt.rcParams['figure.figsize'] = (14, 10) #固定图像大小
ax1 = train['metro'].plot(kind = 'hist', bins=50, grid=True)
plt.axis([0, 6000, 0, 30])
plt.xticks([x for x in range(1, 6000) if x % 1000 == 0])
ax1.set_xlabel('地铁距离')
ax1.set_ylabel('个数')

ax2 = ax1.twinx()
train['metro'].plot(kind = 'kde', color='orange', label='概率密度分布', ax=ax2)
ax2.set_yticks([])
ax2.set_ylabel('')
plt.legend()

plt.savefig('metro.pdf')
plt.show()


ax1 = train['bus'].plot(kind = 'hist', bins=50, grid=True)
plt.axis([0, 2400, 0, 23])
plt.xticks([x for x in range(1, 2401) if x % 500 == 0])
ax1.set_xlabel('公交站距离')
ax1.set_ylabel('个数')

ax2 = ax1.twinx()
train['bus'].plot(kind = 'kde', color='orange', label='概率密度分布', ax=ax2)
ax2.set_yticks([])
ax2.set_ylabel('')
plt.legend()

plt.savefig('bus.pdf')
plt.show()


ax1 = train['school'].plot(kind = 'hist', bins=50, grid=True)
plt.axis([0, 2100, 0, 17])
plt.xticks([x for x in range(1, 2001) if x % 500 == 0])
ax1.set_xlabel('学校距离')
ax1.set_ylabel('个数')

ax2 = ax1.twinx()
ax2 = train['school'].plot(kind = 'kde', color='orange', label='概率密度分布')
ax2.set_yticks([])
ax2.set_ylabel('')
plt.legend()

plt.savefig('school.pdf')
plt.show()


ax1 = train['hospital'].plot(kind = 'hist', bins=50, grid=True)
plt.axis([0, 5000, 0, 25])
plt.xticks([x for x in range(1, 5001) if x % 1000 == 0])
ax1.set_xlabel('医院距离')
ax1.set_ylabel('个数')

ax2 = ax1.twinx()
ax2 = train['hospital'].plot(kind = 'kde', color='orange', label='概率密度分布')
ax2.set_yticks([])
ax2.set_ylabel('')
plt.legend()

plt.savefig('hospital.pdf')
plt.show()


ax1 = train['shop'].plot(kind = 'hist', bins=50, grid=True)
plt.axis([0, 2000, 0, 20])
plt.xticks([x for x in range(1, 2001) if x % 300 == 0])
ax1.set_xlabel('超市距离')
ax1.set_ylabel('个数')

ax2 = ax1.twinx()
ax2 = train['shop'].plot(kind = 'kde', color='orange', label='概率密度分布')
ax2.set_yticks([])
ax2.set_ylabel('')
plt.legend()

plt.savefig('shop.pdf')
plt.show()


ax1 = train['year'].plot(kind = 'hist', bins=50, grid=True)
plt.axis([0, 26, 0, 30])
plt.xticks([x for x in range(1, 30) if x % 5 == 0])
ax1.set_xlabel('房龄')
ax1.set_ylabel('个数')

ax2 = ax1.twinx()
ax2 = train['year'].plot(kind = 'kde', color='orange', label='概率密度分布')
ax2.set_yticks([])
ax2.set_ylabel('')
plt.legend()

plt.savefig('year.pdf')
plt.show()


ax1 = train['size'].plot(kind = 'hist', bins=50, grid=True)
plt.axis([0, 350, 0, 28])
plt.xticks([x for x in range(1, 400) if x % 50 == 0])
ax1.set_xlabel('建筑面积')
ax1.set_ylabel('个数')

ax2 = ax1.twinx()
ax2 = train['size'].plot(kind = 'kde', color='orange', label='概率密度分布')
ax2.set_yticks([])
ax2.set_ylabel('')
plt.legend()

plt.savefig('size.pdf')
plt.show()


def grade(mean, std):  # mean:平均值，std:标准差，用于计算等级分划
    ret = []
    ret.append([0, max(0, mean - std)]) # 5级
    ret.append([max(0, mean - std), mean]) # 4级
    ret.append([mean, mean + std]) # 3级
    ret.append([mean + std, mean + std * 2]) # 2级
    ret.append([mean + std * 2, 1e5]) # 1级
    return ret
def print_grade(name, div):
    print('{}等级划分为: '.format(name), end='')
    for i in range(4):
        print('{:.2f}~{:.2f}, '.format(div[i][0], div[i][1]), end='')
    print('{:.2f}~infty'.format(div[4][0]))
show = lambda name: print_grade(name, grade(df[name].mean(), df[name].std()))

cols = ['size', 'metro', 'bus', 'school', 'hospital', 'shop', 'year']
for col in cols:
    show(col)

size等级划分为: 0.00~88.23, 88.23~119.42, 119.42~150.62, 150.62~181.81, 181.81~infty
metro等级划分为: 0.00~270.03, 270.03~981.22, 981.22~1692.40, 1692.40~2403.58, 2403.58~infty
bus等级划分为: 0.00~230.90, 230.90~515.02, 515.02~799.14, 799.14~1083.26, 1083.26~infty
school等级划分为: 0.00~290.68, 290.68~633.87, 633.87~977.07, 977.07~1320.27, 1320.27~infty
hospital等级划分为: 0.00~159.11, 159.11~957.61, 957.61~1756.11, 1756.11~2554.61, 2554.61~infty
shop等级划分为: 0.00~165.71, 165.71~505.90, 505.90~846.10, 846.10~1186.30, 1186.30~infty
year等级划分为: 0.00~3.96, 3.96~8.86, 8.86~13.75, 13.75~18.65, 18.65~infty


df.describe()


city_name = ['未央区', '雁塔区', '莲湖区', '碑林区', '新城区', '灞桥区', '长安区', '高新区', '经开区',
             '浐灞生态区', '国际港务区', '西咸新区']
city_rank = [4, 5, 2, 4, 2, 1, 3, 5, 3, 3, 3, 4]
def calc_grade(name, x):
    div = grade(df[name].mean(), df[name].std())
    for i in range(4):
        if div[i][0] <= x <= div[i][1]:
            return 5 - i
    return 1
def convert(df):
    n = df.shape[0]
    for col in cols:
        if col == 'year':
            continue
        for i in range(n):
            df.loc[i, col] = calc_grade(col, df.loc[i, col])
    for i in range(n):  # 特殊处理 city 列
        df.loc[i, 'city_name'] = city_name[df.loc[i, 'city']]
        df.loc[i, 'city'] = city_rank[df.loc[i, 'city']]
    return df
train_grade = convert(train.copy())
evl_grade = convert(evl.copy())
tmp = train_grade.pop('index')
tmp = evl_grade.pop('index')


train_grade.to_csv('train_grade.csv', index=False)
evl_grade.to_csv('evl_grade.csv', index=False)


# 全部的数据都转化为等级
df = df.reset_index()
tmp = df.pop('index')
df_grade = convert(df.copy())
df_grade


df_grade.to_csv('all_grade.csv', index=False)

	5_price	6_price	size	metro	bus	school	hospital	shop	city	year
count	79.000000	318.000000	318.000000	318.000000	318.000000	318.000000	318.000000	318.000000	318.000000	318.000000
mean	15053.594937	15961.877358	119.421447	981.216981	515.022013	633.874214	957.610063	505.902516	5.679245	8.855346
std	3007.748142	4330.880408	31.193733	711.182323	284.119598	343.198506	798.500147	340.196742	3.411468	4.899413
min	8205.000000	7142.000000	38.000000	103.000000	44.000000	33.000000	61.000000	11.000000	0.000000	1.000000
25%	13236.000000	13451.750000	100.542500	544.250000	306.750000	367.250000	455.500000	244.500000	3.000000	5.000000
50%	14952.000000	15285.000000	117.070000	814.500000	464.000000	587.000000	768.500000	451.000000	6.000000	8.000000
75%	16482.000000	17626.250000	133.622500	1170.500000	673.750000	833.500000	1110.750000	695.000000	9.000000	12.000000
max	23040.000000	42113.000000	320.820000	5800.000000	2274.000000	2000.000000	5800.000000	1913.000000	11.000000	24.000000

	5_price	6_price	size	metro	bus	school	hospital	shop	city	year
count	79.000000	318.000000	318.000000	318.000000	318.000000	318.000000	318.000000	318.000000	318.000000	318.000000
mean	15053.594937	15961.877358	119.421447	981.216981	515.022013	633.874214	957.610063	505.902516	5.679245	8.855346
std	3007.748142	4330.880408	31.193733	711.182323	284.119598	343.198506	798.500147	340.196742	3.411468	4.899413
min	8205.000000	7142.000000	38.000000	103.000000	44.000000	33.000000	61.000000	11.000000	0.000000	1.000000
25%	13236.000000	13451.750000	100.542500	544.250000	306.750000	367.250000	455.500000	244.500000	3.000000	5.000000
50%	14952.000000	15285.000000	117.070000	814.500000	464.000000	587.000000	768.500000	451.000000	6.000000	8.000000
75%	16482.000000	17626.250000	133.622500	1170.500000	673.750000	833.500000	1110.750000	695.000000	9.000000	12.000000
max	23040.000000	42113.000000	320.820000	5800.000000	2274.000000	2000.000000	5800.000000	1913.000000	11.000000	24.000000

	name	5_price	6_price	size	metro	bus	school	hospital	shop	city	year	city_name
0	恒大名都	NaN	13808	4.0	4.0	3.0	3.0	4.0	3.0	4	10.0	未央区
1	锦园新世纪花园社区	NaN	15822	3.0	3.0	2.0	5.0	4.0	4.0	4	18.0	未央区
2	万科金色悦城	NaN	15352	5.0	4.0	3.0	4.0	2.0	4.0	4	5.0	未央区
3	绿地香树花城	NaN	15345	5.0	3.0	4.0	3.0	3.0	4.0	4	7.0	未央区
4	万科金域东郡	NaN	16539	1.0	4.0	3.0	2.0	4.0	3.0	4	5.0	未央区
...	...	...	...	...	...	...	...	...	...	...	...	...
313	世纪锦城	NaN	12373	3.0	5.0	5.0	4.0	5.0	4.0	4	9.0	西咸新区
314	怡景华庭	NaN	16630	4.0	5.0	5.0	4.0	4.0	4.0	4	15.0	西咸新区
315	奥林匹克花园二期	NaN	14491	3.0	4.0	4.0	3.0	4.0	4.0	4	6.0	西咸新区
316	同德晨曦园	NaN	12443	4.0	1.0	5.0	5.0	4.0	4.0	4	6.0	西咸新区
317	紫薇万科大都会	NaN	15884	4.0	2.0	4.0	2.0	3.0	3.0	4	3.0	西咸新区