Common Feature Engineering (FE) Functions (unfinished)

Commonly used feature engineering functions.

Numeric statistical features

The helper below groups a numeric column by a categorical column (e.g. price by manager_id), computed over train and test combined, and attaches the group size, mean, standard deviation, median, max, and min back to each row as new features.

import pandas as pd
import numpy as np

# Compute per-group statistical features
def get_stats(train_df, test_df, target_column, group_column='manager_id'):
    '''
    target_column: numeric column to aggregate (e.g. price, bedrooms, bathrooms)
    group_column: categorical column to group by (e.g. manager_id, building_id)
    '''
    train_df['row_id'] = np.arange(train_df.shape[0])
    test_df['row_id'] = np.arange(test_df.shape[0])
    train_df['train'] = 1
    test_df['train'] = 0
    all_df = pd.concat([train_df[['row_id', 'train', target_column, group_column]],
                        test_df[['row_id', 'train', target_column, group_column]]])

    grouped = all_df[[target_column, group_column]].groupby(group_column)
    the_size = pd.DataFrame(grouped.size()).reset_index()
    the_size.columns = [group_column, '%s_size' % target_column]
    the_mean = pd.DataFrame(grouped.mean()).reset_index()
    the_mean.columns = [group_column, '%s_mean' % target_column]
    the_std = pd.DataFrame(grouped.std()).reset_index().fillna(0)
    the_std.columns = [group_column, '%s_std' % target_column]
    the_median = pd.DataFrame(grouped.median()).reset_index()
    the_median.columns = [group_column, '%s_median' % target_column]
    the_stats = pd.merge(the_size, the_mean)
    the_stats = pd.merge(the_stats, the_std)
    the_stats = pd.merge(the_stats, the_median)

    the_max = pd.DataFrame(grouped.max()).reset_index()
    the_max.columns = [group_column, '%s_max' % target_column]
    the_min = pd.DataFrame(grouped.min()).reset_index()
    the_min.columns = [group_column, '%s_min' % target_column]

    the_stats = pd.merge(the_stats, the_max)
    the_stats = pd.merge(the_stats, the_min)

    all_df = pd.merge(all_df, the_stats)

    selected_train = all_df[all_df['train'] == 1].copy()
    selected_test = all_df[all_df['train'] == 0].copy()
    selected_train.sort_values('row_id', inplace=True)
    selected_test.sort_values('row_id', inplace=True)
    selected_train.drop([target_column, group_column, 'row_id', 'train'], axis=1, inplace=True)
    selected_test.drop([target_column, group_column, 'row_id', 'train'], axis=1, inplace=True)

    return np.array(selected_train), np.array(selected_test)

# Accumulator lists for the generated feature blocks (assumed to start empty here)
train_stack_list, test_stack_list = [], []
train_fea_list, test_fea_list = [], []

# Numeric columns whose statistics are projected onto manager_id
selected_manager_id_proj = ['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price', 'bad_addr', 'listing_id',
                            'month', 'day', 'weekday', 'day_of_year', 'hour', 'num_features', 'num_desc',
                            'bed_to_bath', 'price_per_bed', 'price_per_bath', 'bldg_count', 'zero_bldg',
                            'total_room', 'room_diff', 'photo_count', 'latitude_grid', 'longitude_grid',
                            'lat_long_grid']

for target_col in selected_manager_id_proj:
    tmp_train, tmp_test = get_stats(train_df, test_df, target_column=target_col)
    train_stack_list.append(tmp_train)
    test_stack_list.append(tmp_test)

# Numeric columns whose statistics are projected onto bedrooms
selected_bedrooms_proj = ['price', 'listing_id', 'month', 'day', 'weekday', 'day_of_year', 'hour',
                          'num_features', 'bldg_count', 'zero_bldg']

for target_col in selected_bedrooms_proj:
    tmp_train, tmp_test = get_stats(train_df, test_df, target_column=target_col, group_column='bedrooms')
    train_stack_list.append(tmp_train)
    test_stack_list.append(tmp_test)

# get_label_encoder and get_label_inter_stats are additional helpers defined elsewhere in the pipeline
for c in ['bathrooms', 'bedrooms', 'zero_bldg', 'latitude_grid', 'longitude_grid', 'lat_long_grid',
          'manager_id', 'building_id']:
    tmp_train, tmp_test = get_label_encoder(c, train_df=train_df, test_df=test_df)
    train_fea_list.append(tmp_train)
    test_fea_list.append(tmp_test)

for target_col in ['price', 'num_features', 'listing_id', 'bedrooms', 'bathrooms']:
    for group_col in ["cluster_1", "cluster_2", "street_address", "manager_id"]:
        tmp_train, tmp_test = get_label_inter_stats(train_df, test_df, target_column=target_col, group_column=group_col)
        train_fea_list.append(tmp_train)
        test_fea_list.append(tmp_test)
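
As a quick illustration of what get_stats returns, here is a minimal sketch on a toy train/test pair; the tiny DataFrames below are made up for demonstration and are not part of the original pipeline.

import pandas as pd

# Hypothetical toy data: three train listings and two test listings for two managers
toy_train = pd.DataFrame({'manager_id': ['a', 'a', 'b'], 'price': [1000, 1500, 2000]})
toy_test = pd.DataFrame({'manager_id': ['a', 'b'], 'price': [1200, 1800]})

train_stats, test_stats = get_stats(toy_train, toy_test, target_column='price')
# Each row now carries the size, mean, std, median, max, and min of price
# for that row's manager_id, computed over train and test combined.
print(train_stats.shape)  # (3, 6)
print(test_stats.shape)   # (2, 6)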

Metafeatures

make_mf_classification builds stacking metafeatures: in each of nb_epoch rounds it samples a random subset of the features, fits the classifier on K-1 folds, stores out-of-fold class probabilities for the training rows, averages the fold models' predictions on the test set, and pickles both arrays.

import numpy as np
from scipy import sparse as ssp
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression


def make_mf_classification(X, y, clf, X_test, n_folds=5, seed=1024, nb_epoch=50,
                           max_features=0.75, name='xgb', path=''):
    '''
    Fit metafeatures with @clf and get predictions for the test set.
    Assumes @clf is a classifier with predict_proba.
    '''
    print(clf)
    np.random.seed(seed)
    feature_index = np.arange(X.shape[1])
    for epoch in range(nb_epoch):
        print("Start epoch:", epoch)
        mf_tr = np.zeros((X.shape[0], len(np.unique(y))))
        mf_te = np.zeros((X_test.shape[0], len(np.unique(y))))
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed).split(X, y)
        # Randomly subsample a fraction of the features for this epoch
        np.random.shuffle(feature_index)
        new_index = feature_index[:int(max_features * len(feature_index))]
        for ind_tr, ind_te in skf:
            if ssp.issparse(X):
                X_tr = X[ind_tr].tocsc()[:, new_index]
                X_te = X[ind_te].tocsc()[:, new_index]
            else:
                X_tr = X[ind_tr][:, new_index]
                X_te = X[ind_te][:, new_index]
            y_tr = y[ind_tr]
            y_te = y[ind_te]
            clf.fit(X_tr, y_tr)
            mf_tr[ind_te] += clf.predict_proba(X_te)
            mf_te += clf.predict_proba(X_test[:, new_index])
            score = log_loss(y_te, mf_tr[ind_te])
            print('\tpred[{}] score: {}'.format(epoch, score))
        mf_te /= n_folds
        pd.to_pickle(mf_tr, path + 'X_mf_%s_%s_random.pkl' % (name, epoch))
        pd.to_pickle(mf_te, path + 'X_t_mf_%s_%s_random.pkl' % (name, epoch))


# X, y, X_t, seed, and path are assumed to be defined earlier (training features, labels, test features)
clf = LogisticRegression(C=0.5, dual=True, solver='liblinear', random_state=seed)  # dual form requires liblinear
make_mf_classification(X, y, clf, X_t, n_folds=5, seed=seed, nb_epoch=50,
                       max_features=0.75, name='lr', path=path)
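
The per-epoch pickles written above can then be averaged and stacked onto the original feature matrix before training a second-level model. The snippet below is only a sketch of that step under the file-naming pattern used by make_mf_classification; X, X_t, and path are the same objects assumed in the call above.

import numpy as np
import pandas as pd
from scipy import sparse as ssp

# Average the 50 bagged metafeature epochs saved by make_mf_classification with name='lr'
mf_train = np.mean([pd.read_pickle(path + 'X_mf_lr_%s_random.pkl' % e) for e in range(50)], axis=0)
mf_test = np.mean([pd.read_pickle(path + 'X_t_mf_lr_%s_random.pkl' % e) for e in range(50)], axis=0)

# Stack the averaged class probabilities next to the original features
X_stacked = ssp.hstack([X, ssp.csr_matrix(mf_train)]).tocsr() if ssp.issparse(X) else np.hstack([X, mf_train])
X_t_stacked = ssp.hstack([X_t, ssp.csr_matrix(mf_test)]).tocsr() if ssp.issparse(X_t) else np.hstack([X_t, mf_test])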