In [1]:
import numpy as np import pandas as pd import matplotlib.pyplot as plt from scipy.stats import kstest plt.rcParams['font.size'] = 20 #固定字体大小 plt.rcParams['figure.figsize'] = (14, 6) #固定图像大小 plt.rcParams['font.sans-serif']=['SimSun'] #用来正常显示中文标签 plt.rcParams['axes.unicode_minus']=False #用来正常显示负号
In [2]:
df = pd.read_excel('data.xlsx') n, m = df.shape type(df.iloc[1])
Out[2]:
pandas.core.series.Series
In [3]:
df.dtypes
Out[3]:
name object 5_price float64 6_price int64 size float64 metro float64 bus float64 school float64 hospital float64 shop float64 city int64 year float64 dtype: object
In [4]:
# 处理计算房龄 for i in range(n): x = df.loc[i, 'year'] if pd.isna(x): continue if x > 1000: x = 2022 - x else: if 80 <= x <= 99: x -= 100 x = 22 - x # print(x) df.loc[i, 'year'] = x
In [5]:
print('没有房龄={}, 没有面积={}'.format(df['year'].isna().sum(), df['size'].isna().sum())) print('既没有房龄也没有面积={}'.format((df['year'].isna() & df['size'].isna()).sum())) print('既没有房龄或者没有面积={}'.format((df['year'].isna() | df['size'].isna()).sum()))
没有房龄=33, 没有面积=40 既没有房龄也没有面积=31 既没有房龄或者没有面积=42
In [6]:
df = df[-(df['year'].isna() | df['size'].isna())]
In [7]:
city_name = ['未央区', '雁塔区', '莲湖区', '碑林区', '新城区', '灞桥区', '长安区', '高新区', '经开区', '浐灞生态区', '国际港务区', '西咸新区'] evl = df[-df['5_price'].isna()].reset_index() n, m = evl.shape cnt = [0] * 12 for i in range(n): cnt[evl.loc[i, 'city']] += 1 print('地区: 用于检验的个数') for i in range(12): print('{}: {}'.format(city_name[i], cnt[i])) print('总计: {}'.format(sum(cnt)))
地区: 用于检验的个数 未央区: 6 雁塔区: 7 莲湖区: 6 碑林区: 7 新城区: 6 灞桥区: 7 长安区: 7 高新区: 7 经开区: 6 浐灞生态区: 6 国际港务区: 7 西咸新区: 7 总计: 79
In [8]:
train = df[df['5_price'].isna()].reset_index()
In [9]:
print('训练数据个数={}, 检验数据个数={}, 总计={}'.format(train.shape[0], evl.shape[0], df.shape[0])) df.describe()
训练数据个数=239, 检验数据个数=79, 总计=318
Out[9]:
5_price | 6_price | size | metro | bus | school | hospital | shop | city | year | |
---|---|---|---|---|---|---|---|---|---|---|
count | 79.000000 | 318.000000 | 318.000000 | 318.000000 | 318.000000 | 318.000000 | 318.000000 | 318.000000 | 318.000000 | 318.000000 |
mean | 15053.594937 | 15961.877358 | 119.421447 | 981.216981 | 515.022013 | 633.874214 | 957.610063 | 505.902516 | 5.679245 | 8.855346 |
std | 3007.748142 | 4330.880408 | 31.193733 | 711.182323 | 284.119598 | 343.198506 | 798.500147 | 340.196742 | 3.411468 | 4.899413 |
min | 8205.000000 | 7142.000000 | 38.000000 | 103.000000 | 44.000000 | 33.000000 | 61.000000 | 11.000000 | 0.000000 | 1.000000 |
25% | 13236.000000 | 13451.750000 | 100.542500 | 544.250000 | 306.750000 | 367.250000 | 455.500000 | 244.500000 | 3.000000 | 5.000000 |
50% | 14952.000000 | 15285.000000 | 117.070000 | 814.500000 | 464.000000 | 587.000000 | 768.500000 | 451.000000 | 6.000000 | 8.000000 |
75% | 16482.000000 | 17626.250000 | 133.622500 | 1170.500000 | 673.750000 | 833.500000 | 1110.750000 | 695.000000 | 9.000000 | 12.000000 |
max | 23040.000000 | 42113.000000 | 320.820000 | 5800.000000 | 2274.000000 | 2000.000000 | 5800.000000 | 1913.000000 | 11.000000 | 24.000000 |
In [10]:
train.to_csv('train.csv') evl.to_csv('evaluation.csv')
In [11]:
# k-s测试 def ks_test(name): return kstest(train[name].to_numpy(), 'norm', (train[name].mean(), train[name].std())) print(ks_test('metro')) print(ks_test('bus')) print(ks_test('school')) print(ks_test('hospital')) print(ks_test('shop'))
KstestResult(statistic=0.14509115603493106, pvalue=7.44853726739269e-05) KstestResult(statistic=0.0886852172244787, pvalue=0.043775126850685186) KstestResult(statistic=0.07806724132243936, pvalue=0.10295009934945765) KstestResult(statistic=0.18220505774787032, pvalue=2.0450720133654144e-07) KstestResult(statistic=0.1024330554958281, pvalue=0.012303964920067209)
In [12]:
plt.rcParams['font.size'] = 40 #固定字体大小 plt.rcParams['figure.figsize'] = (14, 10) #固定图像大小 ax1 = train['metro'].plot(kind = 'hist', bins=50, grid=True) plt.axis([0, 6000, 0, 30]) plt.xticks([x for x in range(1, 6000) if x % 1000 == 0]) ax1.set_xlabel('地铁距离') ax1.set_ylabel('个数') ax2 = ax1.twinx() train['metro'].plot(kind = 'kde', color='orange', label='概率密度分布', ax=ax2) ax2.set_yticks([]) ax2.set_ylabel('') plt.legend() plt.savefig('metro.pdf') plt.show()
In [13]:
ax1 = train['bus'].plot(kind = 'hist', bins=50, grid=True) plt.axis([0, 2400, 0, 23]) plt.xticks([x for x in range(1, 2401) if x % 500 == 0]) ax1.set_xlabel('公交站距离') ax1.set_ylabel('个数') ax2 = ax1.twinx() train['bus'].plot(kind = 'kde', color='orange', label='概率密度分布', ax=ax2) ax2.set_yticks([]) ax2.set_ylabel('') plt.legend() plt.savefig('bus.pdf') plt.show()
In [14]:
ax1 = train['school'].plot(kind = 'hist', bins=50, grid=True) plt.axis([0, 2100, 0, 17]) plt.xticks([x for x in range(1, 2001) if x % 500 == 0]) ax1.set_xlabel('学校距离') ax1.set_ylabel('个数') ax2 = ax1.twinx() ax2 = train['school'].plot(kind = 'kde', color='orange', label='概率密度分布') ax2.set_yticks([]) ax2.set_ylabel('') plt.legend() plt.savefig('school.pdf') plt.show()
In [15]:
ax1 = train['hospital'].plot(kind = 'hist', bins=50, grid=True) plt.axis([0, 5000, 0, 25]) plt.xticks([x for x in range(1, 5001) if x % 1000 == 0]) ax1.set_xlabel('医院距离') ax1.set_ylabel('个数') ax2 = ax1.twinx() ax2 = train['hospital'].plot(kind = 'kde', color='orange', label='概率密度分布') ax2.set_yticks([]) ax2.set_ylabel('') plt.legend() plt.savefig('hospital.pdf') plt.show()
In [16]:
ax1 = train['shop'].plot(kind = 'hist', bins=50, grid=True) plt.axis([0, 2000, 0, 20]) plt.xticks([x for x in range(1, 2001) if x % 300 == 0]) ax1.set_xlabel('超市距离') ax1.set_ylabel('个数') ax2 = ax1.twinx() ax2 = train['shop'].plot(kind = 'kde', color='orange', label='概率密度分布') ax2.set_yticks([]) ax2.set_ylabel('') plt.legend() plt.savefig('shop.pdf') plt.show()
In [17]:
ax1 = train['year'].plot(kind = 'hist', bins=50, grid=True) plt.axis([0, 26, 0, 30]) plt.xticks([x for x in range(1, 30) if x % 5 == 0]) ax1.set_xlabel('房龄') ax1.set_ylabel('个数') ax2 = ax1.twinx() ax2 = train['year'].plot(kind = 'kde', color='orange', label='概率密度分布') ax2.set_yticks([]) ax2.set_ylabel('') plt.legend() plt.savefig('year.pdf') plt.show()
In [18]:
ax1 = train['size'].plot(kind = 'hist', bins=50, grid=True) plt.axis([0, 350, 0, 28]) plt.xticks([x for x in range(1, 400) if x % 50 == 0]) ax1.set_xlabel('建筑面积') ax1.set_ylabel('个数') ax2 = ax1.twinx() ax2 = train['size'].plot(kind = 'kde', color='orange', label='概率密度分布') ax2.set_yticks([]) ax2.set_ylabel('') plt.legend() plt.savefig('size.pdf') plt.show()
In [19]:
def grade(mean, std): # mean:平均值,std:标准差,用于计算等级分划 ret = [] ret.append([0, max(0, mean - std)]) # 5级 ret.append([max(0, mean - std), mean]) # 4级 ret.append([mean, mean + std]) # 3级 ret.append([mean + std, mean + std * 2]) # 2级 ret.append([mean + std * 2, 1e5]) # 1级 return ret def print_grade(name, div): print('{}等级划分为: '.format(name), end='') for i in range(4): print('{:.2f}~{:.2f}, '.format(div[i][0], div[i][1]), end='') print('{:.2f}~infty'.format(div[4][0])) show = lambda name: print_grade(name, grade(df[name].mean(), df[name].std())) cols = ['size', 'metro', 'bus', 'school', 'hospital', 'shop', 'year'] for col in cols: show(col)
size等级划分为: 0.00~88.23, 88.23~119.42, 119.42~150.62, 150.62~181.81, 181.81~infty metro等级划分为: 0.00~270.03, 270.03~981.22, 981.22~1692.40, 1692.40~2403.58, 2403.58~infty bus等级划分为: 0.00~230.90, 230.90~515.02, 515.02~799.14, 799.14~1083.26, 1083.26~infty school等级划分为: 0.00~290.68, 290.68~633.87, 633.87~977.07, 977.07~1320.27, 1320.27~infty hospital等级划分为: 0.00~159.11, 159.11~957.61, 957.61~1756.11, 1756.11~2554.61, 2554.61~infty shop等级划分为: 0.00~165.71, 165.71~505.90, 505.90~846.10, 846.10~1186.30, 1186.30~infty year等级划分为: 0.00~3.96, 3.96~8.86, 8.86~13.75, 13.75~18.65, 18.65~infty
转化表格为等级,其中 year
不进行转换.
In [20]:
df.describe()
Out[20]:
5_price | 6_price | size | metro | bus | school | hospital | shop | city | year | |
---|---|---|---|---|---|---|---|---|---|---|
count | 79.000000 | 318.000000 | 318.000000 | 318.000000 | 318.000000 | 318.000000 | 318.000000 | 318.000000 | 318.000000 | 318.000000 |
mean | 15053.594937 | 15961.877358 | 119.421447 | 981.216981 | 515.022013 | 633.874214 | 957.610063 | 505.902516 | 5.679245 | 8.855346 |
std | 3007.748142 | 4330.880408 | 31.193733 | 711.182323 | 284.119598 | 343.198506 | 798.500147 | 340.196742 | 3.411468 | 4.899413 |
min | 8205.000000 | 7142.000000 | 38.000000 | 103.000000 | 44.000000 | 33.000000 | 61.000000 | 11.000000 | 0.000000 | 1.000000 |
25% | 13236.000000 | 13451.750000 | 100.542500 | 544.250000 | 306.750000 | 367.250000 | 455.500000 | 244.500000 | 3.000000 | 5.000000 |
50% | 14952.000000 | 15285.000000 | 117.070000 | 814.500000 | 464.000000 | 587.000000 | 768.500000 | 451.000000 | 6.000000 | 8.000000 |
75% | 16482.000000 | 17626.250000 | 133.622500 | 1170.500000 | 673.750000 | 833.500000 | 1110.750000 | 695.000000 | 9.000000 | 12.000000 |
max | 23040.000000 | 42113.000000 | 320.820000 | 5800.000000 | 2274.000000 | 2000.000000 | 5800.000000 | 1913.000000 | 11.000000 | 24.000000 |
In [21]:
city_name = ['未央区', '雁塔区', '莲湖区', '碑林区', '新城区', '灞桥区', '长安区', '高新区', '经开区', '浐灞生态区', '国际港务区', '西咸新区'] city_rank = [4, 5, 2, 4, 2, 1, 3, 5, 3, 3, 3, 4] def calc_grade(name, x): div = grade(df[name].mean(), df[name].std()) for i in range(4): if div[i][0] <= x <= div[i][1]: return 5 - i return 1 def convert(df): n = df.shape[0] for col in cols: if col == 'year': continue for i in range(n): df.loc[i, col] = calc_grade(col, df.loc[i, col]) for i in range(n): # 特殊处理 city 列 df.loc[i, 'city_name'] = city_name[df.loc[i, 'city']] df.loc[i, 'city'] = city_rank[df.loc[i, 'city']] return df train_grade = convert(train.copy()) evl_grade = convert(evl.copy()) tmp = train_grade.pop('index') tmp = evl_grade.pop('index')
In [22]:
train_grade.to_csv('train_grade.csv', index=False) evl_grade.to_csv('evl_grade.csv', index=False)
In [23]:
# 全部的数据都转化为等级 df = df.reset_index() tmp = df.pop('index') df_grade = convert(df.copy()) df_grade
Out[23]:
name | 5_price | 6_price | size | metro | bus | school | hospital | shop | city | year | city_name | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 恒大名都 | NaN | 13808 | 4.0 | 4.0 | 3.0 | 3.0 | 4.0 | 3.0 | 4 | 10.0 | 未央区 |
1 | 锦园新世纪花园社区 | NaN | 15822 | 3.0 | 3.0 | 2.0 | 5.0 | 4.0 | 4.0 | 4 | 18.0 | 未央区 |
2 | 万科金色悦城 | NaN | 15352 | 5.0 | 4.0 | 3.0 | 4.0 | 2.0 | 4.0 | 4 | 5.0 | 未央区 |
3 | 绿地香树花城 | NaN | 15345 | 5.0 | 3.0 | 4.0 | 3.0 | 3.0 | 4.0 | 4 | 7.0 | 未央区 |
4 | 万科金域东郡 | NaN | 16539 | 1.0 | 4.0 | 3.0 | 2.0 | 4.0 | 3.0 | 4 | 5.0 | 未央区 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
313 | 世纪锦城 | NaN | 12373 | 3.0 | 5.0 | 5.0 | 4.0 | 5.0 | 4.0 | 4 | 9.0 | 西咸新区 |
314 | 怡景华庭 | NaN | 16630 | 4.0 | 5.0 | 5.0 | 4.0 | 4.0 | 4.0 | 4 | 15.0 | 西咸新区 |
315 | 奥林匹克花园二期 | NaN | 14491 | 3.0 | 4.0 | 4.0 | 3.0 | 4.0 | 4.0 | 4 | 6.0 | 西咸新区 |
316 | 同德晨曦园 | NaN | 12443 | 4.0 | 1.0 | 5.0 | 5.0 | 4.0 | 4.0 | 4 | 6.0 | 西咸新区 |
317 | 紫薇万科大都会 | NaN | 15884 | 4.0 | 2.0 | 4.0 | 2.0 | 3.0 | 3.0 | 4 | 3.0 | 西咸新区 |
318 rows × 12 columns
In [24]:
df_grade.to_csv('all_grade.csv', index=False)